mekaneeky
Go GO
0a06de9
raw
history blame
No virus
5.53 kB
# """ from https://github.com/keithito/tacotron """
# from text import cleaners
# #from text.symbols import symbols
# class TextMapper(object):
# def __init__(self, vocab_file):
# self.symbols = [x.replace("\n", "") for x in open(vocab_file, encoding="utf-8").readlines()]
# self.SPACE_ID = self.symbols.index(" ")
# self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
# self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}
# def text_to_sequence(self, text, cleaner_names):
# '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
# Args:
# text: string to convert to a sequence
# cleaner_names: names of the cleaner functions to run the text through
# Returns:
# List of integers corresponding to the symbols in the text
# '''
# sequence = []
# clean_text = text.strip()
# for symbol in clean_text:
# symbol_id = self._symbol_to_id[symbol]
# sequence += [symbol_id]
# return sequence
# def uromanize(self, text, uroman_pl):
# iso = "xxx"
# with tempfile.NamedTemporaryFile() as tf, \
# tempfile.NamedTemporaryFile() as tf2:
# with open(tf.name, "w") as f:
# f.write("\n".join([text]))
# cmd = f"perl " + uroman_pl
# cmd += f" -l {iso} "
# cmd += f" < {tf.name} > {tf2.name}"
# os.system(cmd)
# outtexts = []
# with open(tf2.name) as f:
# for line in f:
# line = re.sub(r"\s+", " ", line).strip()
# outtexts.append(line)
# outtext = outtexts[0]
# return outtext
# def get_text(self, text, hps):
# text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
# if hps.data.add_blank:
# text_norm = commons.intersperse(text_norm, 0)
# text_norm = torch.LongTensor(text_norm)
# return text_norm
# def filter_oov(self, text):
# val_chars = self._symbol_to_id
# txt_filt = "".join(list(filter(lambda x: x in val_chars, text)))
# #print(f"text after filtering OOV: {txt_filt}")
# return txt_filt
# def preprocess_text(txt, text_mapper, hps, uroman_dir=None, lang=None):
# txt = preprocess_char(txt, lang=lang)
# is_uroman = hps.data.training_files.split('.')[-1] == 'uroman'
# if is_uroman:
# with tempfile.TemporaryDirectory() as tmp_dir:
# if uroman_dir is None:
# cmd = f"git clone [email protected]:isi-nlp/uroman.git {tmp_dir}"
# print(cmd)
# subprocess.check_output(cmd, shell=True)
# uroman_dir = tmp_dir
# uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
# print(f"uromanize")
# txt = text_mapper.uromanize(txt, uroman_pl)
# print(f"uroman text: {txt}")
# txt = txt.lower()
# txt = text_mapper.filter_oov(txt)
# return txt
# # Mappings from symbol to numeric ID and vice versa:
# mapper = TextMapper("ach/vocab.txt")
# _symbol_to_id = mapper._symbol_to_id#{s: i for i, s in enumerate(symbols)}
# _id_to_symbol = mapper._id_to_symbol#{i: s for i, s in enumerate(symbols)}
# def preprocess_text(txt, text_mapper, hps, uroman_dir=None, lang=None):
# txt = preprocess_char(txt, lang=lang)
# is_uroman = hps.data.training_files.split('.')[-1] == 'uroman'
# if is_uroman:
# with tempfile.TemporaryDirectory() as tmp_dir:
# if uroman_dir is None:
# cmd = f"git clone [email protected]:isi-nlp/uroman.git {tmp_dir}"
# print(cmd)
# subprocess.check_output(cmd, shell=True)
# uroman_dir = tmp_dir
# uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
# print(f"uromanize")
# txt = text_mapper.uromanize(txt, uroman_pl)
# print(f"uroman text: {txt}")
# txt = txt.lower()
# txt = text_mapper.filter_oov(txt)
# return txt
# def text_to_sequence(text, cleaner_names):
# '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
# Args:
# text: string to convert to a sequence
# cleaner_names: names of the cleaner functions to run the text through
# Returns:
# List of integers corresponding to the symbols in the text
# '''
# sequence = []
# clean_text = _clean_text(text, cleaner_names)
# for symbol in clean_text:
# symbol_id = _symbol_to_id[symbol]
# sequence += [symbol_id]
# return sequence
# def cleaned_text_to_sequence(cleaned_text):
# '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
# Args:
# text: string to convert to a sequence
# Returns:
# List of integers corresponding to the symbols in the text
# '''
# sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
# return sequence
# def sequence_to_text(sequence):
# '''Converts a sequence of IDs back to a string'''
# result = ''
# for symbol_id in sequence:
# s = _id_to_symbol[symbol_id]
# result += s
# return result
# def _clean_text(text, cleaner_names):
# for name in cleaner_names:
# cleaner = getattr(cleaners, name)
# if not cleaner:
# raise Exception('Unknown cleaner: %s' % name)
# text = cleaner(text)
# return text