File size: 5,168 Bytes
03204e8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
import re
from unidecode import unidecode
import pyopenjtalk
# Regular expression matching Japanese without punctuation marks:
_japanese_characters = re.compile(
r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
# Regular expression matching non-Japanese characters or punctuation marks:
_japanese_marks = re.compile(
r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
# List of (symbol, Japanese) pairs for marks:
_symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [
('%', 'パーセント')
]]
# List of (romaji, ipa) pairs for marks:
_romaji_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
('ts', 'ʦ'),
('u', 'ɯ'),
('j', 'ʥ'),
('y', 'j'),
('ni', 'n^i'),
('nj', 'n^'),
('hi', 'çi'),
('hj', 'ç'),
('f', 'ɸ'),
('I', 'i*'),
('U', 'ɯ*'),
('r', 'ɾ')
]]
# List of (romaji, ipa2) pairs for marks:
_romaji_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
('u', 'ɯ'),
('ʧ', 'tʃ'),
('j', 'dʑ'),
('y', 'j'),
('ni', 'n^i'),
('nj', 'n^'),
('hi', 'çi'),
('hj', 'ç'),
('f', 'ɸ'),
('I', 'i*'),
('U', 'ɯ*'),
('r', 'ɾ')
]]
# List of (consonant, sokuon) pairs:
_real_sokuon = [(re.compile('%s' % x[0]), x[1]) for x in [
(r'Q([↑↓]*[kg])', r'k#\1'),
(r'Q([↑↓]*[tdjʧ])', r't#\1'),
(r'Q([↑↓]*[sʃ])', r's\1'),
(r'Q([↑↓]*[pb])', r'p#\1')
]]
# List of (consonant, hatsuon) pairs:
_real_hatsuon = [(re.compile('%s' % x[0]), x[1]) for x in [
(r'N([↑↓]*[pbm])', r'm\1'),
(r'N([↑↓]*[ʧʥj])', r'n^\1'),
(r'N([↑↓]*[tdn])', r'n\1'),
(r'N([↑↓]*[kg])', r'ŋ\1')
]]
def symbols_to_japanese(text):
for regex, replacement in _symbols_to_japanese:
text = re.sub(regex, replacement, text)
return text
def japanese_to_romaji_with_accent(text):
'''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
text = symbols_to_japanese(text)
sentences = re.split(_japanese_marks, text)
marks = re.findall(_japanese_marks, text)
text = ''
for i, sentence in enumerate(sentences):
if re.match(_japanese_characters, sentence):
if text != '':
text += ' '
labels = pyopenjtalk.extract_fullcontext(sentence)
for n, label in enumerate(labels):
phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
if phoneme not in ['sil', 'pau']:
text += phoneme.replace('ch', 'ʧ').replace('sh',
'ʃ').replace('cl', 'Q')
else:
continue
# n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
a2 = int(re.search(r"\+(\d+)\+", label).group(1))
a3 = int(re.search(r"\+(\d+)/", label).group(1))
if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']:
a2_next = -1
else:
a2_next = int(
re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
# Accent phrase boundary
if a3 == 1 and a2_next == 1:
text += ' '
# Falling
elif a1 == 0 and a2_next == a2 + 1:
text += '↓'
# Rising
elif a2 == 1 and a2_next == 2:
text += '↑'
if i < len(marks):
text += unidecode(marks[i]).replace(' ', '')
return text
def get_real_sokuon(text):
for regex, replacement in _real_sokuon:
text = re.sub(regex, replacement, text)
return text
def get_real_hatsuon(text):
for regex, replacement in _real_hatsuon:
text = re.sub(regex, replacement, text)
return text
def japanese_to_ipa(text):
text = japanese_to_romaji_with_accent(text).replace('...', '…')
text = re.sub(
r'([aiueo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
text = get_real_sokuon(text)
text = get_real_hatsuon(text)
for regex, replacement in _romaji_to_ipa:
text = re.sub(regex, replacement, text)
return text
def japanese_to_ipa2(text):
text = japanese_to_romaji_with_accent(text).replace('...', '…')
text = get_real_sokuon(text)
text = get_real_hatsuon(text)
for regex, replacement in _romaji_to_ipa2:
text = re.sub(regex, replacement, text)
return text
def japanese_to_ipa3(text):
text = japanese_to_ipa2(text).replace('n^', 'ȵ').replace(
'ʃ', 'ɕ').replace('*', '\u0325').replace('#', '\u031a')
text = re.sub(
r'([aiɯeo])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
text = re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text)
return text
|