import re
import unicodedata

import jaconv
import pyopenjtalk
from num2words import num2words
from transformers import AutoTokenizer

from .. import punctuation, symbols


def kata2phoneme(text: str) -> list:
    """Convert katakana text to a list of phonemes."""
    text = text.strip()
    if text == "ー":
        return ["ー"]
    elif text.startswith("ー"):
        return ["ー"] + kata2phoneme(text[1:])
    res = []
    prev = None
    while text:
        # Punctuation marks pass through one character at a time.
        if re.match(_MARKS, text):
            res.append(text[0])
            text = text[1:]
            continue
        # A long-vowel mark repeats the last phoneme of the previous chunk.
        if text.startswith("ー"):
            if prev:
                res.append(prev[-1])
            text = text[1:]
            continue
        # pyopenjtalk marks the sokuon as "cl"; the symbol set uses "q".
        res += pyopenjtalk.g2p(text).lower().replace("cl", "q").split(" ")
        break
    return res
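
# Illustrative example (actual phonemes depend on the pyopenjtalk dictionary):
#   >>> kata2phoneme("カタカナ")
#   ['k', 'a', 't', 'a', 'k', 'a', 'n', 'a']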


def hira2kata(text: str) -> str:
    """Convert hiragana to katakana."""
    return jaconv.hira2kata(text)
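
# Example:
#   >>> hira2kata("こんにちは")
#   'コンニチハ'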


_SYMBOL_TOKENS = set(list("・、。?!"))
_NO_YOMI_TOKENS = set(list("「」『』―()[][]"))
# Matches any character that is not a Latin letter, digit, kana, kanji,
# or one of their fullwidth/halfwidth variants.
_MARKS = re.compile(
    r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)


def text2kata(text: str) -> str:
    """Convert Japanese text to its katakana reading via pyopenjtalk."""
    parsed = pyopenjtalk.run_frontend(text)

    res = []
    for parts in parsed:
        word, yomi = replace_punctuation(parts["orig"]), parts["pron"].replace("’", "")
        if yomi:
            if re.match(_MARKS, yomi):
                # Multi-character punctuation is kept per character.
                if len(word) > 1:
                    word = [replace_punctuation(i) for i in list(word)]
                    yomi = word
                    res += yomi
                    continue
                # Unknown marks are read as a comma.
                elif word not in rep_map.keys() and word not in rep_map.values():
                    word = ","
                yomi = word
            res.append(yomi)
        else:
            if word in _SYMBOL_TOKENS:
                res.append(word)
            elif word in ("っ", "ッ"):
                res.append("ッ")
            elif word in _NO_YOMI_TOKENS:
                pass
            else:
                res.append(word)
    return hira2kata("".join(res))
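
# Illustrative example (pyopenjtalk returns pronunciations, so the particle
# は is read ワ):
#   >>> text2kata("こんにちは")
#   'コンニチワ'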


def text2sep_kata(text: str) -> tuple:
    """Like text2kata, but also return the per-word text segmentation."""
    parsed = pyopenjtalk.run_frontend(text)

    res = []
    sep = []
    for parts in parsed:
        word, yomi = replace_punctuation(parts["orig"]), parts["pron"].replace("’", "")
        if yomi:
            if re.match(_MARKS, yomi):
                # Multi-character punctuation is kept per character.
                if len(word) > 1:
                    word = [replace_punctuation(i) for i in list(word)]
                    yomi = word
                    res += yomi
                    sep += word
                    continue
                # Unknown marks are read as a comma.
                elif word not in rep_map.keys() and word not in rep_map.values():
                    word = ","
                yomi = word
            res.append(yomi)
        else:
            if word in _SYMBOL_TOKENS:
                res.append(word)
            elif word in ("っ", "ッ"):
                res.append("ッ")
            elif word in _NO_YOMI_TOKENS:
                pass
            else:
                res.append(word)
        sep.append(word)
    return sep, [hira2kata(i) for i in res]
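
# Illustrative example (segmentation depends on the pyopenjtalk dictionary):
#   >>> text2sep_kata("猫が好き")
#   (['猫', 'が', '好き'], ['ネコ', 'ガ', 'スキ'])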


_ALPHASYMBOL_YOMI = {
    "#": "シャープ",
    "%": "パーセント",
    "&": "アンド",
    "+": "プラス",
    "-": "マイナス",
    ":": "コロン",
    ";": "セミコロン",
    "<": "小なり",
    "=": "イコール",
    ">": "大なり",
    "@": "アット",
    "a": "エー",
    "b": "ビー",
    "c": "シー",
    "d": "ディー",
    "e": "イー",
    "f": "エフ",
    "g": "ジー",
    "h": "エイチ",
    "i": "アイ",
    "j": "ジェー",
    "k": "ケー",
    "l": "エル",
    "m": "エム",
    "n": "エヌ",
    "o": "オー",
    "p": "ピー",
    "q": "キュー",
    "r": "アール",
    "s": "エス",
    "t": "ティー",
    "u": "ユー",
    "v": "ブイ",
    "w": "ダブリュー",
    "x": "エックス",
    "y": "ワイ",
    "z": "ゼット",
    "α": "アルファ",
    "β": "ベータ",
    "γ": "ガンマ",
    "δ": "デルタ",
    "ε": "イプシロン",
    "ζ": "ゼータ",
    "η": "イータ",
    "θ": "シータ",
    "ι": "イオタ",
    "κ": "カッパ",
    "λ": "ラムダ",
    "μ": "ミュー",
    "ν": "ニュー",
    "ξ": "クサイ",
    "ο": "オミクロン",
    "π": "パイ",
    "ρ": "ロー",
    "σ": "シグマ",
    "τ": "タウ",
    "υ": "ウプシロン",
    "φ": "ファイ",
    "χ": "カイ",
    "ψ": "プサイ",
    "ω": "オメガ",
}


_NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
_CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
_CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
_NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")


def japanese_convert_numbers_to_words(text: str) -> str:
    # Drop thousands separators, move currency symbols after the amount,
    # then spell numbers out with num2words.
    res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
    res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
    res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
    return res
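
# Illustrative example (the exact reading comes from num2words):
#   >>> japanese_convert_numbers_to_words("¥1,000")
#   '千円'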


def japanese_convert_alpha_symbols_to_words(text: str) -> str:
    return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()])


def japanese_text_to_phonemes(text: str) -> list:
    """Convert Japanese text to a list of phonemes."""
    res = unicodedata.normalize("NFKC", text)
    res = japanese_convert_numbers_to_words(res)
    res = text2kata(res)
    res = kata2phoneme(res)
    return res
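
# Illustrative example (output depends on pyopenjtalk; g2p output is
# lowercased, so the moraic nasal "N" appears as "n"):
#   >>> japanese_text_to_phonemes("こんにちは")
#   ['k', 'o', 'n', 'n', 'i', 'ch', 'i', 'w', 'a']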


def is_japanese_character(char):
    japanese_ranges = [
        (0x3040, 0x309F),  # Hiragana
        (0x30A0, 0x30FF),  # Katakana
        (0x4E00, 0x9FFF),  # CJK Unified Ideographs
        (0x3400, 0x4DBF),  # CJK Unified Ideographs Extension A
        (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
    ]

    char_code = ord(char)
    for start, end in japanese_ranges:
        if start <= char_code <= end:
            return True
    return False
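
# Example:
#   >>> is_japanese_character("あ")
#   True
#   >>> is_japanese_character("A")
#   False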


rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    ".": ".",
    "...": "…",
    "···": "…",
    "・・・": "…",
    "·": ",",
    "・": ",",
    "、": ",",
    "$": ".",
    "“": "'",
    "”": "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "−": "-",
    "~": "-",
    "~": "-",
    "「": "'",
    "」": "'",
}


def replace_punctuation(text):
    """Map punctuation to ASCII equivalents and drop unsupported characters."""
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))

    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    # Keep only kana, kanji, and the punctuation symbols the model knows.
    replaced_text = re.sub(
        r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF"
        + "".join(punctuation)
        + r"]+",
        "",
        replaced_text,
    )

    return replaced_text
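
# Illustrative example (assumes "," and "!" are in the imported punctuation
# list):
#   >>> replace_punctuation("こんにちは、世界!")
#   'こんにちは,世界!'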


def text_normalize(text):
    """NFKC-normalize, spell out numbers, then unify punctuation."""
    res = unicodedata.normalize("NFKC", text)
    res = japanese_convert_numbers_to_words(res)
    res = replace_punctuation(res)
    return res
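
# Illustrative example (Latin letters are stripped by replace_punctuation,
# assuming the stock punctuation list):
#   >>> text_normalize("hello,こんにちは、世界!")
#   ',こんにちは,世界!'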


def distribute_phone(n_phone, n_word):
    # Greedily assign each phoneme to the word that currently has the
    # fewest, yielding as even a split as possible.
    phones_per_word = [0] * n_word
    for task in range(n_phone):
        min_tasks = min(phones_per_word)
        min_index = phones_per_word.index(min_tasks)
        phones_per_word[min_index] += 1
    return phones_per_word
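
# Example:
#   >>> distribute_phone(5, 2)
#   [3, 2]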


def handle_long(sep_phonemes):
    # Resolve long-vowel marks ("ー") by copying the preceding phoneme.
    for i in range(len(sep_phonemes)):
        if sep_phonemes[i][0] == "ー":
            sep_phonemes[i][0] = sep_phonemes[i - 1][-1]
        if "ー" in sep_phonemes[i]:
            for j in range(len(sep_phonemes[i])):
                if sep_phonemes[i][j] == "ー":
                    sep_phonemes[i][j] = sep_phonemes[i][j - 1][-1]
    return sep_phonemes
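
# Example: a leading "ー" copies the last phoneme of the previous word.
#   >>> handle_long([["s", "o"], ["ー", "d", "a"]])
#   [['s', 'o'], ['o', 'd', 'a']]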


tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")


def g2p(norm_text):
    sep_text, sep_kata = text2sep_kata(norm_text)
    sep_tokenized = [tokenizer.tokenize(i) for i in sep_text]
    sep_phonemes = handle_long([kata2phoneme(i) for i in sep_kata])

    # Every phoneme must be in the model's symbol set.
    for i in sep_phonemes:
        for j in i:
            assert j in symbols, (sep_text, sep_kata, sep_phonemes)

    # Spread each word's phonemes across its BERT subword tokens.
    word2ph = []
    for token, phoneme in zip(sep_tokenized, sep_phonemes):
        phone_len = len(phoneme)
        word_len = len(token)
        word2ph += distribute_phone(phone_len, word_len)
    phones = ["_"] + [j for i in sep_phonemes for j in i] + ["_"]
    tones = [0 for i in phones]
    word2ph = [1] + word2ph + [1]
    return phones, tones, word2ph
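
# Illustrative return structure: phones are padded with "_" on both sides,
# tones are all zero, and word2ph gains a matching 1 at each end:
#   >>> phones, tones, word2ph = g2p("こんにちは")
#   >>> phones  # roughly
#   ['_', 'k', 'o', 'n', 'n', 'i', 'ch', 'i', 'w', 'a', '_']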


if __name__ == "__main__":
    text = "hello,こんにちは、世界ー!……"
    from text.japanese_bert import get_bert_feature

    text = text_normalize(text)
    print(text)

    phones, tones, word2ph = g2p(text)
    bert = get_bert_feature(text, word2ph)

    print(phones, tones, word2ph, bert.shape)