import re
import sys

import panphon
import phonemizer
import torch

from Preprocessing.papercup_features import generate_feature_table


class ArticulatoryCombinedTextFrontend:

    def __init__(self,
                 language,
                 use_word_boundaries=False,  # goes together well with parallel models and an aligner.
                 # Doesn't go together well with autoregressive models.
                 use_explicit_eos=True,
                 use_prosody=False,  # unfortunately the non-segmental nature of prosodic markers mixed with the
                 # sequential phonemes hurts the performance of end-to-end models a lot, even though one might
                 # think enriching the input with such information would help.
                 use_lexical_stress=False,
                 silent=True,
                 allow_unknown=False,
                 add_silence_to_end=True,
                 strip_silence=True):
        """
        Mostly preparing ID lookups
        """
        self.strip_silence = strip_silence
        self.use_word_boundaries = use_word_boundaries
        self.allow_unknown = allow_unknown
        self.use_explicit_eos = use_explicit_eos
        self.use_prosody = use_prosody
        self.use_stress = use_lexical_stress
        self.add_silence_to_end = add_silence_to_end
        self.feature_table = panphon.FeatureTable()

        if language == "en":
            self.g2p_lang = "en-us"
            self.expand_abbreviations = english_text_expansion
            if not silent:
                print("Created an English Text-Frontend")

        elif language == "de":
            self.g2p_lang = "de"
            self.expand_abbreviations = lambda x: x
            if not silent:
                print("Created a German Text-Frontend")

        elif language == "el":
            self.g2p_lang = "el"
            self.expand_abbreviations = lambda x: x
            if not silent:
                print("Created a Greek Text-Frontend")

        elif language == "es":
            self.g2p_lang = "es"
            self.expand_abbreviations = lambda x: x
            if not silent:
                print("Created a Spanish Text-Frontend")

        elif language == "fi":
            self.g2p_lang = "fi"
            self.expand_abbreviations = lambda x: x
            if not silent:
                print("Created a Finnish Text-Frontend")

        elif language == "ru":
            self.g2p_lang = "ru"
            self.expand_abbreviations = lambda x: x
            if not silent:
                print("Created a Russian Text-Frontend")

        elif language == "hu":
            self.g2p_lang = "hu"
            self.expand_abbreviations = lambda x: x
            if not silent:
                print("Created a Hungarian Text-Frontend")

        elif language == "nl":
            self.g2p_lang = "nl"
            self.expand_abbreviations = lambda x: x
            if not silent:
                print("Created a Dutch Text-Frontend")

        elif language == "fr":
            self.g2p_lang = "fr-fr"
            self.expand_abbreviations = lambda x: x
            if not silent:
                print("Created a French Text-Frontend")

        elif language == "it":
            self.g2p_lang = "it"
            self.expand_abbreviations = lambda x: x
            if not silent:
                print("Created an Italian Text-Frontend")

        elif language == "pt":
            self.g2p_lang = "pt"
            self.expand_abbreviations = lambda x: x
            if not silent:
                print("Created a Portuguese Text-Frontend")

        elif language == "pl":
            self.g2p_lang = "pl"
            self.expand_abbreviations = lambda x: x
            if not silent:
                print("Created a Polish Text-Frontend")

        # remember to also update get_language_id() when adding something here
        else:
            print("Language not supported yet")
            sys.exit()

        self.phone_to_vector_papercup = generate_feature_table()

        self.phone_to_vector = dict()
        for phone in self.phone_to_vector_papercup:
            panphon_features = self.feature_table.word_to_vector_list(phone, numeric=True)
            if panphon_features == []:
                panphon_features = [[0] * 24]
            papercup_features = self.phone_to_vector_papercup[phone]
            self.phone_to_vector[phone] = papercup_features + panphon_features[0]

        self.phone_to_id = {  # this lookup must be updated manually, because the only
            # other way would be extracting them from a set, which can be non-deterministic
            '~': 0, '#': 1, '?': 2, '!': 3, '.': 4,
            'ɜ': 5, 'ɫ': 6, 'ə': 7, 'ɚ': 8, 'a': 9,
            'ð': 10, 'ɛ': 11, 'ɪ': 12, 'ᵻ': 13, 'ŋ': 14,
            'ɔ': 15, 'ɒ': 16, 'ɾ': 17, 'ʃ': 18, 'θ': 19,
            'ʊ': 20, 'ʌ': 21, 'ʒ': 22, 'æ': 23, 'b': 24,
            'ʔ': 25, 'd': 26, 'e': 27, 'f': 28, 'g': 29,
            'h': 30, 'i': 31, 'j': 32, 'k': 33, 'l': 34,
            'm': 35, 'n': 36, 'ɳ': 37, 'o': 38, 'p': 39,
            'ɡ': 40, 'ɹ': 41, 'r': 42, 's': 43, 't': 44,
            'u': 45, 'v': 46, 'w': 47, 'x': 48, 'z': 49,
            'ʀ': 50, 'ø': 51, 'ç': 52, 'ɐ': 53, 'œ': 54,
            'y': 55, 'ʏ': 56, 'ɑ': 57, 'c': 58, 'ɲ': 59,
            'ɣ': 60, 'ʎ': 61, 'β': 62, 'ʝ': 63, 'ɟ': 64,
            'q': 65, 'ɕ': 66, 'ʲ': 67, 'ɭ': 68, 'ɵ': 69,
            'ʑ': 70, 'ʋ': 71, 'ʁ': 72, 'ɨ': 73, 'ʂ': 74,
            'ɬ': 75
        }  # for the states of the ctc loss and dijkstra/mas in the aligner

        self.id_to_phone = {v: k for k, v in self.phone_to_id.items()}
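        # every vector in self.phone_to_vector built above is a papercup
        # articulatory feature vector concatenated with the first of the
        # phone's 24-dimensional numeric panphon vectors (all zeros when
        # panphon does not know the phone), so all entries share one fixed
        # length and can be stacked into a single tensor by string_to_tensor()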
    def string_to_tensor(self, text, view=False, device="cpu", handle_missing=True, input_phonemes=False):
        """
        Fixes unicode errors, expands some abbreviations,
        turns graphemes into phonemes and then vectorizes
        the sequence as articulatory features
        """
        if input_phonemes:
            phones = text
        else:
            phones = self.get_phone_string(text=text, include_eos_symbol=True)
        if view:
            print("Phonemes: \n{}\n".format(phones))
        phones_vector = list()
        # turn into numeric vectors
        for char in phones:
            if handle_missing:
                try:
                    phones_vector.append(self.phone_to_vector[char])
                except KeyError:
                    print("unknown phoneme: {}".format(char))
            else:
                phones_vector.append(self.phone_to_vector[char])  # leave error handling to elsewhere
        # torch.tensor is used instead of the legacy torch.Tensor constructor,
        # which does not accept non-CPU devices; dtype stays float32 as before
        return torch.tensor(phones_vector, dtype=torch.float32, device=device)

    def get_phone_string(self, text, include_eos_symbol=True):
        # expand abbreviations
        utt = self.expand_abbreviations(text)
        # phonemize
        phones = phonemizer.phonemize(utt,
                                      language_switch='remove-flags',
                                      backend="espeak",
                                      language=self.g2p_lang,
                                      preserve_punctuation=True,
                                      strip=True,
                                      punctuation_marks=';:,.!?¡¿—…"«»“”~/',
                                      with_stress=self.use_stress) \
            .replace(";", ",").replace("/", " ").replace("—", "") \
            .replace(":", ",").replace('"', ",").replace("-", ",").replace("...", ",") \
            .replace("\n", " ").replace("\t", " ").replace("¡", "").replace("¿", "") \
            .replace(",", "~") \
            .replace(" ̃", "").replace('̩', "").replace("̃", "").replace("̪", "")  # zero-width combining diacritics are hidden in these last replacements
        phones = re.sub("~+", "~", phones)
        if not self.use_prosody:
            # retain ~ as a heuristic pause marker, even though all other symbols are removed with this option.
            # also retain . ? and ! since they can be indicators for the stop token
            phones = phones.replace("ˌ", "").replace("ː", "").replace("ˑ", "") \
                .replace("˘", "").replace("|", "").replace("‖", "")
        if not self.use_word_boundaries:
            phones = phones.replace(" ", "")
        else:
            phones = re.sub(r"\s+", " ", phones)
            phones = re.sub(" ", "~", phones)
        if self.strip_silence:
            phones = phones.lstrip("~").rstrip("~")
        if self.add_silence_to_end:
            phones += "~"  # adding a silence to the end produces more natural sounding prosody
        if include_eos_symbol:
            phones += "#"
        phones = "~" + phones
        phones = re.sub("~+", "~", phones)
        return phones
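
# Illustrative sketch, not part of the original module: how the frontend's two
# lookups are typically used together. string_to_tensor() produces the
# articulatory feature matrix for the acoustic model, while phone_to_id maps
# the same phoneme string to the integer states consumed by the CTC loss and
# dijkstra/mas in the aligner. Name and signature below are assumptions made
# for demonstration purposes only.
def text_to_aligner_ids(frontend, text):
    phones = frontend.get_phone_string(text, include_eos_symbol=True)
    # symbols missing from the lookup are skipped here; the real aligner
    # may handle unknown symbols differently
    return [frontend.phone_to_id[p] for p in phones if p in frontend.phone_to_id]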
def english_text_expansion(text):
    """
    Applies a small part of the tacotron-style text cleaning pipeline, suitable for e.g. LJSpeech.
    See https://github.com/keithito/tacotron/
    Careful: Only apply to English datasets. Different languages need different cleaners.
    """
    # the pattern already appends the literal period, so the keys must not contain one
    _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in
                      [('Mrs', 'misess'), ('Mr', 'mister'), ('Dr', 'doctor'), ('St', 'saint'),
                       ('Co', 'company'), ('Jr', 'junior'), ('Maj', 'major'), ('Gen', 'general'),
                       ('Drs', 'doctors'), ('Rev', 'reverend'), ('Lt', 'lieutenant'), ('Hon', 'honorable'),
                       ('Sgt', 'sergeant'), ('Capt', 'captain'), ('Esq', 'esquire'), ('Ltd', 'limited'),
                       ('Col', 'colonel'), ('Ft', 'fort')]]
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def get_language_id(language):
    if language == "en":
        return torch.LongTensor([12])
    elif language == "de":
        return torch.LongTensor([1])
    elif language == "el":
        return torch.LongTensor([2])
    elif language == "es":
        return torch.LongTensor([3])
    elif language == "fi":
        return torch.LongTensor([4])
    elif language == "ru":
        return torch.LongTensor([5])
    elif language == "hu":
        return torch.LongTensor([6])
    elif language == "nl":
        return torch.LongTensor([7])
    elif language == "fr":
        return torch.LongTensor([8])
    elif language == "pt":
        return torch.LongTensor([9])
    elif language == "pl":
        return torch.LongTensor([10])
    elif language == "it":
        return torch.LongTensor([11])


if __name__ == '__main__':
    # test an English utterance
    tfr_en = ArticulatoryCombinedTextFrontend(language="en")
    print(tfr_en.string_to_tensor("This is a complex sentence, it even has a pause! But can it do this? Nice.", view=True))

    # test a German utterance
    tfr_de = ArticulatoryCombinedTextFrontend(language="de")
    print(tfr_de.string_to_tensor("Alles klar, jetzt testen wir einen deutschen Satz. Ich hoffe es gibt nicht mehr viele unspezifizierte Phoneme.", view=True))
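    # illustrative extra checks, not part of the original tests: look at the raw
    # phoneme string and at the ID sequence sketched in text_to_aligner_ids()
    print(tfr_en.get_phone_string("This is a complex sentence, it even has a pause!"))
    print(text_to_aligner_ids(tfr_en, "Nice."))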