|
|
|
|
|
|
|
|
|
|
|
import re |
|
from g2p_en import G2p |
|
from string import punctuation |
|
|
|
|
|
def read_lexicon(lex_path):
    """Load a pronunciation lexicon into a dict keyed by lower-cased word.

    Every non-blank line is read as a word followed by its phones, all
    separated by arbitrary runs of whitespace. When a word appears more than
    once (case-insensitively), only its first pronunciation is kept.

    Args:
        lex_path: Path of the lexicon text file (read as UTF-8).

    Returns:
        A dict mapping each lower-cased word to its list of phone strings.
        A line that holds only a word maps that word to an empty list.
    """
    lexicon = {}
    with open(lex_path, encoding="utf-8") as f:
        for line in f:
            # str.split() drops leading/trailing whitespace, so a trailing
            # space or tab no longer produces an empty "" phone.
            fields = line.split()
            if not fields:
                # Blank line: skip it instead of registering a "" entry.
                continue
            # setdefault keeps the first pronunciation seen for a word.
            lexicon.setdefault(fields[0].lower(), fields[1:])
    return lexicon
|
|
|
|
|
def preprocess_english(text, lexicon):
    """Convert English text into a space-separated phone string.

    Trailing punctuation is stripped, the text is split into words and
    delimiter tokens, and each token is converted to phones: the lexicon is
    consulted first (case-insensitively) and G2p is used as the fallback.
    Any phone that is empty or a single non-word, non-space character
    (i.e. punctuation) is rendered as the pause symbol ``sp``.

    Args:
        text: The input sentence.
        lexicon: Mapping of lower-cased word to a list of phone strings.

    Returns:
        The phones joined by single spaces, e.g. ``"HH AH0 sp W ER1"``.
        An input with no convertible tokens yields an empty string.
    """
    text = text.rstrip(punctuation)

    # Build the G2p model only if some token is missing from the lexicon.
    g2p = None
    phones = []
    # The capture group keeps the delimiters as tokens so punctuation can
    # become a pause. NOTE: the "+" sits inside the character class, so the
    # split is on single characters (including a literal "+").
    for token in re.split(r"([,;.\-\?\!\s+])", text):
        if not token.strip():
            # Empty strings between adjacent delimiters and the whitespace
            # delimiters themselves carry no phones.
            continue
        key = token.lower()
        if key in lexicon:
            phones.extend(lexicon[key])
            continue
        if g2p is None:
            g2p = G2p()
        # G2p separates words with " " entries; those are not phones.
        phones.extend(p for p in g2p(token) if p != " ")

    # Map pauses per phone so that the first and last entries are handled
    # the same way as interior ones.
    return " ".join(
        "sp" if re.fullmatch(r"[^\w\s]?", p) else p for p in phones
    )
|
|