# Spaces: Running  (appears to be hosting-page status text captured with the file; not part of the module)
# Copyright (c) 2023 Amphion. | |
# | |
# This source code is licensed under the MIT license found in the | |
# LICENSE file in the root directory of this source tree. | |
import json | |
import numpy as np | |
import os | |
import torch | |
import copy | |
from g2p_en import G2p | |
import re | |
import unicodedata | |
from g2p_en import G2p | |
from g2p_en.expand import normalize_numbers | |
# Module-level G2p instance, constructed at import time. In the visible part
# of this file nothing reads it: process() takes its own `g2p` argument,
# test() and G2pProcessor build separate instances, and the __main__ block
# rebinds this name. NOTE(review): presumably kept for external importers --
# confirm before removing.
g2p = G2p()
# Phone vocabulary. The position of each entry is its integer id (PHPONE2ID is
# built from the list index), so entries must never be reordered, inserted or
# removed without also updating anything that stores ids.
#
# Apart from punctuation, the special tokens (<BOS>, <EOS>, <PAD>, <UNK>) and
# the word-boundary marker "|", entries are a phone symbol followed by the
# word-position suffix that process() appends: "B" for the first phone of a
# word, "I" for interior phones and "E" for the last phone.
# NOTE(review): ".B", "AO1" and "L" do not follow that pattern, and "UH2B"
# sits out of sorted order; all are kept as-is because ids are index-based.
PHONE_SET = [
    "!",
    ",",
    ".",
    ".B",  # NOTE(review): punctuation with a position suffix -- confirm it is intended
    ":",
    "<BOS>",
    "<EOS>",
    "<PAD>",
    "<UNK>",
    "?",
    "AA0B",
    "AA0E",
    "AA0I",
    "AA1B",
    "AA1E",
    "AA1I",
    "AA2B",
    "AA2E",
    "AA2I",
    "AE0B",
    "AE0E",
    "AE0I",
    "AE1B",
    "AE1E",
    "AE1I",
    "AE2B",
    "AE2E",
    "AE2I",
    "AH0B",
    "AH0E",
    "AH0I",
    "AH1B",
    "AH1E",
    "AH1I",
    "AH2B",
    "AH2E",
    "AH2I",
    "AO0B",
    "AO0E",
    "AO0I",
    "AO1",  # NOTE(review): bare phone without a B/I/E suffix
    "AO1B",
    "AO1E",
    "AO1I",
    "AO2B",
    "AO2E",
    "AO2I",
    "AW0B",
    "AW0E",
    "AW0I",
    "AW1B",
    "AW1E",
    "AW1I",
    "AW2B",
    "AW2E",
    "AW2I",
    "AY0B",
    "AY0E",
    "AY0I",
    "AY1B",
    "AY1E",
    "AY1I",
    "AY2B",
    "AY2E",
    "AY2I",
    "BB",
    "BE",
    "BI",
    "CHB",
    "CHE",
    "CHI",
    "DB",
    "DE",
    "DHB",
    "DHE",
    "DHI",
    "DI",
    "EH0B",
    "EH0E",
    "EH0I",
    "EH1B",
    "EH1E",
    "EH1I",
    "EH2B",
    "EH2E",
    "EH2I",
    "ER0B",
    "ER0E",
    "ER0I",
    "ER1B",
    "ER1E",
    "ER1I",
    "ER2B",
    "ER2E",
    "ER2I",
    "EY0B",
    "EY0E",
    "EY0I",
    "EY1B",
    "EY1E",
    "EY1I",
    "EY2B",
    "EY2E",
    "EY2I",
    "FB",
    "FE",
    "FI",
    "GB",
    "GE",
    "GI",
    "HHB",
    "HHE",
    "HHI",
    "IH0B",
    "IH0E",
    "IH0I",
    "IH1B",
    "IH1E",
    "IH1I",
    "IH2B",
    "IH2E",
    "IH2I",
    "IY0B",
    "IY0E",
    "IY0I",
    "IY1B",
    "IY1E",
    "IY1I",
    "IY2B",
    "IY2E",
    "IY2I",
    "JHB",
    "JHE",
    "JHI",
    "KB",
    "KE",
    "KI",
    "L",  # NOTE(review): bare phone without a B/I/E suffix
    "LB",
    "LE",
    "LI",
    "MB",
    "ME",
    "MI",
    "NB",
    "NE",
    "NGB",
    "NGE",
    "NGI",
    "NI",
    "OW0B",
    "OW0E",
    "OW0I",
    "OW1B",
    "OW1E",
    "OW1I",
    "OW2B",
    "OW2E",
    "OW2I",
    "OY0B",
    "OY0E",
    "OY0I",
    "OY1B",
    "OY1E",
    "OY1I",
    "OY2B",
    "OY2E",
    "OY2I",
    "PB",
    "PE",
    "PI",
    "RB",
    "RE",
    "RI",
    "SB",
    "SE",
    "SHB",
    "SHE",
    "SHI",
    "SI",
    "TB",
    "TE",
    "THB",
    "THE",
    "THI",
    "TI",
    "UH0B",
    "UH0E",
    "UH0I",
    "UH1B",
    "UH2B",  # NOTE(review): out of sorted order (precedes "UH1E"); position fixes its id
    "UH1E",
    "UH1I",
    "UH2E",
    "UH2I",
    "UW0B",
    "UW0E",
    "UW0I",
    "UW1B",
    "UW1E",
    "UW1I",
    "UW2B",
    "UW2E",
    "UW2I",
    "VB",
    "VE",
    "VI",
    "WB",
    "WE",
    "WI",
    "YB",
    "YE",
    "YI",
    "ZB",
    "ZE",
    "ZHB",
    "ZHE",
    "ZHI",
    "ZI",
    "|",
]
# Reverse lookup: phone symbol -> integer id, where the id is the symbol's
# index in PHONE_SET. (The spelling "PHPONE2ID" is the established public
# name and is kept as-is.)
PHPONE2ID = {phone: idx for idx, phone in enumerate(PHONE_SET)}

# Punctuation characters that the text normalisation treats specially.
PUNCS = "!,.?;:"
def is_sil_phoneme(p):
    """Return True when ``p`` counts as silence / non-speech.

    A symbol is treated as silence if it is the empty string or if its first
    character is not alphabetic (punctuation, "|", "<BOS>", ...).
    """
    if p == "":
        return True
    leading_char = p[0]
    return not leading_char.isalpha()
def add_bdr(txt_struct):
    """Insert a word-boundary entry between adjacent non-silence words.

    Args:
        txt_struct: List of ``[word, [phone, ...]]`` entries.

    Returns:
        A new list holding the same entries, with ``["|", ["|"]]`` placed
        between every pair of neighbours whose words are both non-silence
        according to ``is_sil_phoneme``. The input list is not modified.
    """
    final_index = len(txt_struct) - 1
    with_boundaries = []
    for position, entry in enumerate(txt_struct):
        with_boundaries.append(entry)
        # Nothing can follow the last entry, so no boundary is needed there.
        if position == final_index:
            continue
        # A boundary is only placed between two spoken words.
        if is_sil_phoneme(entry[0]) or is_sil_phoneme(txt_struct[position + 1][0]):
            continue
        with_boundaries.append(["|", ["|"]])
    return with_boundaries
def preprocess_text(text):
    """Normalize raw English text into lowercase words separated by spaces.

    Steps, in order: expand numbers with ``normalize_numbers``, strip
    combining accent marks, lowercase, drop quotes and parentheses, turn
    hyphens into spaces, discard every character outside ``a-z``, space and
    ``PUNCS``, tidy punctuation, rewrite the abbreviations "i.e." and "etc.",
    and finally replace all remaining punctuation with spaces.

    Args:
        text: Input string.

    Returns:
        The normalized string. Runs of whitespace are collapsed to a single
        space, but a leading or trailing space may remain; ``process`` strips
        the result before use.
    """
    text = normalize_numbers(text)
    # Decompose to NFD and drop combining marks (category "Mn") to strip accents.
    text = "".join(
        char
        for char in unicodedata.normalize("NFD", text)
        if unicodedata.category(char) != "Mn"
    )
    text = text.lower()
    text = re.sub("['\"()]+", "", text)
    text = re.sub("[-]+", " ", text)
    text = re.sub(f"[^ a-z{PUNCS}]", "", text)
    # Remove the optional space on either side of a punctuation mark:
    # "a , b" -> "a,b".
    text = re.sub(f" ?([{PUNCS}]) ?", r"\1", text)
    # Collapse a run of punctuation to a single mark (the last one): "!!" -> "!".
    text = re.sub(f"([{PUNCS}])+", r"\1", text)
    # The space after each abbreviation was removed by the step above, so the
    # replacement must supply its own separator; otherwise "i.e. foo" would
    # become "that isfoo" and "etc. and" would become "etcand". Any surplus
    # space is collapsed at the end.
    text = text.replace("i.e.", "that is ")
    text = text.replace("etc.", "etc ")
    # Remove punctuation for now by turning every mark into a space.
    text = re.sub(f"([{PUNCS}])", r" ", text)
    text = re.sub(r"\s+", r" ", text)
    return text
def postprocess(txt_struct):
    """Trim edge silence, add word boundaries and wrap with BOS/EOS entries.

    Args:
        txt_struct: List of ``[word, [phone, ...]]`` entries.

    Returns:
        A new list: ``<BOS>``, the entries with leading and trailing silence
        removed and ``"|"`` boundaries inserted by ``add_bdr``, then ``<EOS>``.
    """
    first, end = 0, len(txt_struct)
    # Skip silence entries at the front ...
    while first < end and is_sil_phoneme(txt_struct[first][0]):
        first += 1
    # ... and then at the back.
    while end > first and is_sil_phoneme(txt_struct[end - 1][0]):
        end -= 1
    spoken = add_bdr(txt_struct[first:end])
    return [["<BOS>", ["<BOS>"]], *spoken, ["<EOS>", ["<EOS>"]]]
def process(txt, g2p):
    """Convert raw text into a word-aligned, position-tagged phone structure.

    The text is normalized with ``preprocess_text`` and stripped, passed to
    ``g2p`` as a whole, and the returned phones are grouped into words using
    the ``" "`` items found in the g2p output. Within each non-silence word
    the first phone gets a ``"B"`` suffix, interior phones ``"I"`` and the
    last phone ``"E"``; a single-phone word gets only ``"B"``. ``postprocess``
    then trims silence, inserts ``"|"`` boundaries and adds ``<BOS>``/``<EOS>``.

    Args:
        txt: Raw input text.
        g2p: Callable taking the normalized string and returning a flat
            sequence of phone strings in which ``" "`` separates words.

    Returns:
        A tuple ``(txt_struct, txt)``: ``txt_struct`` is a list of
        ``[word, [phone, ...]]`` entries and ``txt`` is the normalized text.
    """
    txt = preprocess_text(txt).strip()
    phs = g2p(txt)

    words = txt.split(" ")
    phones_by_word = [[] for _ in words]
    last_word = len(words) - 1  # split() always yields at least one item
    i_word = 0
    for p in phs:
        if p == " ":
            # The g2p output may contain more separators than there are
            # whitespace-delimited words (NOTE(review): e.g. if its tokenizer
            # splits a word -- confirm). Clamp so the surplus phones land on
            # the final word instead of raising IndexError.
            i_word = min(i_word + 1, last_word)
        else:
            phones_by_word[i_word].append(p)

    txt_struct_ret = []
    for word, phones in zip(words, phones_by_word):
        # Silence words keep their phones untouched; a word for which g2p
        # produced no phones has nothing to tag (previously an IndexError).
        if phones and not is_sil_phoneme(word):
            tagged = [phones[0] + "B"]
            if len(phones) > 1:
                tagged.extend(ph + "I" for ph in phones[1:-1])
                tagged.append(phones[-1] + "E")
            phones = tagged
        txt_struct_ret.append([word, phones])

    txt_struct_ret = postprocess(txt_struct_ret)
    return txt_struct_ret, txt
def test():
    """Smoke test: run one sentence through the pipeline and print each stage.

    Prints, in order: the processed word/phone structure, the normalized
    text, the flattened phone sequence and the corresponding phone ids.
    """
    converter = G2p()
    sentence = "This is a test sentence."
    structure, normalized = process(sentence, converter)
    print(structure)
    print(normalized)
    # Flatten the per-word phone lists into one sequence.
    flat_phones = []
    for entry in structure:
        flat_phones.extend(entry[1])
    print(flat_phones)
    print([PHPONE2ID[ph] for ph in flat_phones])
class G2pProcessor:
    """Callable wrapper that maps English text to PHONE_SET ids and back."""

    def __init__(self):
        # Each processor owns its own G2p instance.
        self.g2p = G2p()

    def __call__(self, txt, lang="en"):
        """Shorthand for ``txt2phoneid(txt)``; ``lang`` is accepted but not used here."""
        return self.txt2phoneid(txt)

    def txt2phoneid(self, txt):
        """Convert text to phone ids.

        Returns:
            A tuple ``(None, phone_id)`` where ``phone_id`` is the list of
            ``PHPONE2ID`` ids for the phones produced by ``process``. A phone
            missing from ``PHPONE2ID`` raises ``KeyError``.
        """
        structure, _ = process(txt, self.g2p)
        phone_id = []
        for entry in structure:
            for phone in entry[1]:
                phone_id.append(PHPONE2ID[phone])
        return None, phone_id

    def phoneid2txt(self, phone_id):
        """Map a sequence of phone ids back to their PHONE_SET symbols."""
        return [PHONE_SET[idx] for idx in phone_id]
if __name__ == "__main__":
    # Demo: convert one sentence to phone ids and print the symbols back.
    # Note that this rebinds the module-level name `g2p` to a G2pProcessor.
    g2p = G2pProcessor()
    txt = "This is a test sentence."
    phoneid = g2p.txt2phoneid(txt)[1]
    # Ids for the phone list shown below, recomputed against the PHONE_SET in
    # this file ("|" is index 219 and "ZE" is index 214):
    # output: [5, 73, 118, 175, 219, 116, 214, 219, 28, 219, 180, 82, 179, 181, 219, 174, 82, 149, 185, 30, 149, 175, 6]
    # print(phoneid)
    print(g2p.phoneid2txt(phoneid))
    # output (as recorded by the original author):
    # output: ['<BOS>', 'DHB', 'IH1I', 'SE', '|', 'IH1B', 'ZE', '|', 'AH0B', '|', 'TB', 'EH1I', 'SI', 'TE', '|', 'SB', 'EH1I', 'NI', 'TI', 'AH0I', 'NI', 'SE', '<EOS>']
    print(len(PHONE_SET))
    # output: 220