""" adapted from https://github.com/keithito/tacotron """ import re import numpy as np from . import cleaners from python.common.text.symbols import get_symbols from .cmudict import CMUDict from python.common.text.numbers import _currency_re, _expand_currency ######### # REGEX # ######### # Regular expression matching text enclosed in curly braces for encoding _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') # Regular expression matching words and not words _words_re = re.compile(r"([a-zA-ZÀ-ž]+['][a-zA-ZÀ-ž]{1,2}|[a-zA-ZÀ-ž]+)|([{][^}]+[}]|[^a-zA-ZÀ-ž{}]+)") # Regular expression separating words enclosed in curly braces for cleaning _arpa_re = re.compile(r'{[^}]+}|\S+') def lines_to_list(filename): with open(filename, encoding='utf-8') as f: lines = f.readlines() lines = [l.rstrip() for l in lines] return lines class TextProcessing(object): def __init__(self, symbol_set, cleaner_names, p_arpabet=0.0, handle_arpabet='word', handle_arpabet_ambiguous='ignore', expand_currency=True): self.symbols = get_symbols(symbol_set) self.cleaner_names = cleaner_names # Mappings from symbol to numeric ID and vice versa: self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)} self.id_to_symbol = {i: s for i, s in enumerate(self.symbols)} self.expand_currency = expand_currency # cmudict self.p_arpabet = p_arpabet self.handle_arpabet = handle_arpabet self.handle_arpabet_ambiguous = handle_arpabet_ambiguous def text_to_sequence(self, text): sequence = [] # Check for curly braces and treat their contents as ARPAbet: while len(text): m = _curly_re.match(text) if not m: sequence += self.symbols_to_sequence(text) break sequence += self.symbols_to_sequence(m.group(1)) sequence += self.arpabet_to_sequence(m.group(2)) text = m.group(3) return sequence def sequence_to_text(self, sequence): # result = '' result = [] for symbol_id in sequence: if symbol_id in self.id_to_symbol: s = self.id_to_symbol[symbol_id] # Enclose ARPAbet back in curly braces: if len(s) > 1 and s[0] == '@': s = '{%s}' % s[1:] # result += s result.append(s) return "|".join(result) # return result.replace('}{', ' ') def clean_text(self, text): for name in self.cleaner_names: cleaner = getattr(cleaners, name) if not cleaner: raise Exception('Unknown cleaner: %s' % name) text = cleaner(text) return text def symbols_to_sequence(self, symbols): return [self.symbol_to_id[s] for s in symbols if s in self.symbol_to_id] def arpabet_to_sequence(self, text): return self.symbols_to_sequence(['@' + s for s in text.split()]) def get_arpabet(self, word): arpabet_suffix = '' if word.lower() in cmudict.heteronyms: return word if len(word) > 2 and word.endswith("'s"): arpabet = cmudict.lookup(word) if arpabet is None: arpabet = self.get_arpabet(word[:-2]) arpabet_suffix = ' Z' elif len(word) > 1 and word.endswith("s"): arpabet = cmudict.lookup(word) if arpabet is None: arpabet = self.get_arpabet(word[:-1]) arpabet_suffix = ' Z' else: arpabet = cmudict.lookup(word) if arpabet is None: return word elif arpabet[0] == '{': arpabet = [arpabet[1:-1]] if len(arpabet) > 1: if self.handle_arpabet_ambiguous == 'first': arpabet = arpabet[0] elif self.handle_arpabet_ambiguous == 'random': arpabet = np.random.choice(arpabet) elif self.handle_arpabet_ambiguous == 'ignore': return word else: arpabet = arpabet[0] arpabet = "{" + arpabet + arpabet_suffix + "}" return arpabet # def get_characters(self, word): # for name in self.cleaner_names: # cleaner = getattr(cleaners, f'{name}_post_chars') # if not cleaner: # raise Exception('Unknown cleaner: %s' % name) # word = cleaner(word) # return word def capitalize_repetitions (self, text): text_out = [] for letter in text: if len(text_out)==0: text_out.append(letter) else: if text_out[-1].lower()==letter.lower(): if text_out[-1]==letter.lower(): text_out.append(letter.upper()) elif text_out[-1]==letter.upper(): text_out.append(letter.lower()) else: text_out.append(letter.lower()) return "".join(text_out) def encode_text(self, text, return_all=False): if self.expand_currency: text = re.sub(_currency_re, _expand_currency, text) text_clean = [self.clean_text(split) if split[0] != '{' else split for split in _arpa_re.findall(text)] text_clean = ' '.join(text_clean) text = text_clean text_arpabet = '' if self.p_arpabet > 0: if self.handle_arpabet == 'sentence': if np.random.uniform() < self.p_arpabet: words = _words_re.findall(text) text_arpabet = [ self.get_arpabet(word[0]) if (word[0] != '') else word[1] for word in words] text_arpabet = ''.join(text_arpabet) text = text_arpabet elif self.handle_arpabet == 'word': words = _words_re.findall(text) text_arpabet = [ word[1] if word[0] == '' else ( self.get_arpabet(word[0]) if np.random.uniform() < self.p_arpabet else word[0]) for word in words] text_arpabet = ''.join(text_arpabet) text = text_arpabet elif self.handle_arpabet != '': raise Exception("{} handle_arpabet is not supported".format( self.handle_arpabet)) text = self.capitalize_repetitions(text) text_encoded = self.text_to_sequence(text) if return_all: return text_encoded, text_clean, text_arpabet return text_encoded