Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
""" adapted from https://github.com/keithito/tacotron """ | |
import re | |
import numpy as np | |
from . import cleaners | |
from python.common.text.symbols import get_symbols | |
from .cmudict import CMUDict | |
from python.common.text.numbers import _currency_re, _expand_currency | |
#########
# REGEX #
#########

# Matches the first curly-brace span in a string, for encoding.
# Groups: (1) text before the braces (non-greedy), (2) the brace contents,
# (3) everything after the closing brace.
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')

# Tokenizes text into (word, non_word) pairs; exactly one of the two groups is
# non-empty per match. A word is a run of Latin letters (including the À-ž
# range), optionally followed by an apostrophe and one or two more letters.
# A non-word is either a whole {...} span or a run of any other characters.
_words_re = re.compile(r"([a-zA-ZÀ-ž]+['][a-zA-ZÀ-ž]{1,2}|[a-zA-ZÀ-ž]+)|([{][^}]+[}]|[^a-zA-ZÀ-ž{}]+)")

# Splits text into whitespace-delimited chunks for cleaning, keeping every
# {...} span together as a single chunk even when it contains spaces.
_arpa_re = re.compile(r'{[^}]+}|\S+')
def lines_to_list(filename):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Args:
        filename: Path of the file to read.

    Returns:
        One string per line of the file, in order, with all trailing
        whitespace (including the line terminator) removed. Blank lines are
        kept as empty strings.
    """
    with open(filename, encoding='utf-8') as f:
        # Iterate the handle directly; no need to materialize readlines() first.
        return [line.rstrip() for line in f]
class TextProcessing(object):
    """Convert raw text into sequences of symbol IDs (and back).

    Text is optionally currency-expanded, run through the configured cleaner
    functions, optionally converted word-by-word to ARPAbet (wrapped in curly
    braces), and finally mapped to integer IDs using the symbol table returned
    by ``get_symbols(symbol_set)``.

    Args:
        symbol_set: Identifier passed to ``get_symbols`` to obtain the symbols.
        cleaner_names: Iterable of attribute names on the ``cleaners`` module;
            each is applied in order by :meth:`clean_text`.
        p_arpabet: Probability threshold compared against
            ``np.random.uniform()`` to decide whether to convert to ARPAbet.
            ``0`` disables the conversion entirely.
        handle_arpabet: ``'word'`` draws once per word, ``'sentence'`` draws
            once per call; ``''`` performs no conversion. Any other value
            raises in :meth:`encode_text` when ``p_arpabet > 0``.
        handle_arpabet_ambiguous: What to do when a word has more than one
            pronunciation: ``'first'``, ``'random'`` or ``'ignore'`` (keep the
            plain word).
        expand_currency: Whether :meth:`encode_text` applies the currency
            substitution before cleaning.
    """

    def __init__(self, symbol_set, cleaner_names, p_arpabet=0.0,
                 handle_arpabet='word', handle_arpabet_ambiguous='ignore',
                 expand_currency=True):
        self.symbols = get_symbols(symbol_set)
        self.cleaner_names = cleaner_names

        # Mappings from symbol to numeric ID and vice versa:
        self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self.id_to_symbol = {i: s for i, s in enumerate(self.symbols)}
        self.expand_currency = expand_currency

        # cmudict
        self.p_arpabet = p_arpabet
        self.handle_arpabet = handle_arpabet
        self.handle_arpabet_ambiguous = handle_arpabet_ambiguous

    def text_to_sequence(self, text):
        """Encode ``text`` as a list of symbol IDs.

        Text inside curly braces is treated as space-separated ARPAbet
        symbols; everything else is encoded character by character.
        """
        sequence = []

        # Check for curly braces and treat their contents as ARPAbet:
        while len(text):
            m = _curly_re.match(text)
            if not m:
                # No (more) brace spans: encode the remainder as characters.
                sequence += self.symbols_to_sequence(text)
                break
            sequence += self.symbols_to_sequence(m.group(1))
            sequence += self.arpabet_to_sequence(m.group(2))
            text = m.group(3)

        return sequence

    def sequence_to_text(self, sequence):
        """Decode symbol IDs into a ``'|'``-separated string of symbols.

        IDs that are not in the symbol table are skipped. ARPAbet symbols
        (stored with a leading ``'@'``) are rendered as ``{SYMBOL}``.
        """
        # Symbols are collected in a list and joined with '|' so that every
        # symbol stays individually visible in the output.
        result = []
        for symbol_id in sequence:
            if symbol_id in self.id_to_symbol:
                s = self.id_to_symbol[symbol_id]
                # Enclose ARPAbet back in curly braces:
                if len(s) > 1 and s[0] == '@':
                    s = '{%s}' % s[1:]
                result.append(s)
        return "|".join(result)

    def clean_text(self, text):
        """Apply every cleaner named in ``self.cleaner_names`` to ``text``.

        Raises:
            Exception: If a name does not resolve to an attribute of the
                ``cleaners`` module.
        """
        for name in self.cleaner_names:
            # The None default is required: without it getattr raises
            # AttributeError and the "Unknown cleaner" check is never reached.
            cleaner = getattr(cleaners, name, None)
            if not cleaner:
                raise Exception('Unknown cleaner: %s' % name)
            text = cleaner(text)

        return text

    def symbols_to_sequence(self, symbols):
        """Map each known symbol to its ID, silently dropping unknown ones."""
        return [self.symbol_to_id[s] for s in symbols if s in self.symbol_to_id]

    def arpabet_to_sequence(self, text):
        """Encode space-separated ARPAbet symbols (looked up with '@' prefix)."""
        return self.symbols_to_sequence(['@' + s for s in text.split()])

    def get_arpabet(self, word):
        """Return ``word`` as a ``{...}`` ARPAbet string, or ``word`` unchanged.

        The word is returned unchanged when it is a heteronym, when no
        pronunciation is found, or when it has several pronunciations and
        ``handle_arpabet_ambiguous`` is ``'ignore'``. For words ending in
        ``'s`` or ``s`` that are not found, the stem is looked up instead
        and `` Z`` is appended to its pronunciation.
        """
        # NOTE(review): `cmudict` is not bound by the imports visible at the
        # top of this file (only the CMUDict class is imported) — confirm that
        # a module-level instance is defined elsewhere.
        arpabet_suffix = ''

        if word.lower() in cmudict.heteronyms:
            return word

        if len(word) > 2 and word.endswith("'s"):
            arpabet = cmudict.lookup(word)
            if arpabet is None:
                arpabet = self.get_arpabet(word[:-2])
                arpabet_suffix = ' Z'
        elif len(word) > 1 and word.endswith("s"):
            arpabet = cmudict.lookup(word)
            if arpabet is None:
                arpabet = self.get_arpabet(word[:-1])
                arpabet_suffix = ' Z'
        else:
            arpabet = cmudict.lookup(word)

        if arpabet is None:
            return word
        elif arpabet[0] == '{':
            # Result of the recursive call: strip the braces and treat it as
            # a single unambiguous pronunciation.
            arpabet = [arpabet[1:-1]]
        elif isinstance(arpabet, str):
            # The recursive call returned the plain stem (not convertible).
            # Indexing that string below would emit a single character as a
            # bogus pronunciation, so keep the original word instead.
            return word

        if len(arpabet) > 1:
            if self.handle_arpabet_ambiguous == 'first':
                arpabet = arpabet[0]
            elif self.handle_arpabet_ambiguous == 'random':
                arpabet = np.random.choice(arpabet)
            elif self.handle_arpabet_ambiguous == 'ignore':
                return word
        else:
            arpabet = arpabet[0]

        arpabet = "{" + arpabet + arpabet_suffix + "}"

        return arpabet

    # def get_characters(self, word):
    #     for name in self.cleaner_names:
    #         cleaner = getattr(cleaners, f'{name}_post_chars')
    #         if not cleaner:
    #             raise Exception('Unknown cleaner: %s' % name)
    #         word = cleaner(word)
    #     return word

    def capitalize_repetitions(self, text):
        """Lowercase ``text`` while alternating the case of repeated characters.

        The first character is kept as-is. Each later character is lowercased
        unless it equals the previously emitted character (ignoring case), in
        which case it is emitted in the opposite case of that character, e.g.
        ``'Hello'`` -> ``'HelLo'`` and ``'aaa'`` -> ``'aAa'``.
        """
        text_out = []
        for letter in text:
            if not text_out:
                text_out.append(letter)
            elif text_out[-1].lower() == letter.lower():
                # Repetition: flip the case relative to the previous output.
                if text_out[-1] == letter.lower():
                    text_out.append(letter.upper())
                elif text_out[-1] == letter.upper():
                    text_out.append(letter.lower())
            else:
                text_out.append(letter.lower())
        return "".join(text_out)

    def encode_text(self, text, return_all=False):
        """Run the full pipeline on ``text`` and return its symbol IDs.

        Steps: optional currency expansion, cleaning of every chunk that is
        not a ``{...}`` span, optional ARPAbet conversion (controlled by
        ``p_arpabet`` and ``handle_arpabet``), ``capitalize_repetitions``,
        then ``text_to_sequence``.

        Args:
            text: Input string.
            return_all: When true, also return the intermediate strings.

        Returns:
            The list of symbol IDs, or the tuple
            ``(text_encoded, text_clean, text_arpabet)`` when ``return_all``
            is true. ``text_arpabet`` is ``''`` if no conversion took place.

        Raises:
            Exception: If ``p_arpabet > 0`` and ``handle_arpabet`` is not
                ``'sentence'``, ``'word'`` or ``''``.
        """
        if self.expand_currency:
            text = re.sub(_currency_re, _expand_currency, text)

        # Clean chunk by chunk so that {...} spans bypass the cleaners.
        text_clean = [self.clean_text(split) if split[0] != '{' else split
                      for split in _arpa_re.findall(text)]
        text_clean = ' '.join(text_clean)
        text = text_clean

        text_arpabet = ''
        if self.p_arpabet > 0:
            if self.handle_arpabet == 'sentence':
                # One random draw decides for the whole input.
                if np.random.uniform() < self.p_arpabet:
                    words = _words_re.findall(text)
                    text_arpabet = [
                        self.get_arpabet(word[0])
                        if (word[0] != '') else word[1]
                        for word in words]
                    text_arpabet = ''.join(text_arpabet)
                    text = text_arpabet
            elif self.handle_arpabet == 'word':
                # One random draw per word; non-word tokens pass through.
                words = _words_re.findall(text)
                text_arpabet = [
                    word[1] if word[0] == '' else (
                        self.get_arpabet(word[0])
                        if np.random.uniform() < self.p_arpabet
                        else word[0])
                    for word in words]
                text_arpabet = ''.join(text_arpabet)
                text = text_arpabet
            elif self.handle_arpabet != '':
                raise Exception("{} handle_arpabet is not supported".format(
                    self.handle_arpabet))

        text = self.capitalize_repetitions(text)
        text_encoded = self.text_to_sequence(text)

        if return_all:
            return text_encoded, text_clean, text_arpabet

        return text_encoded