# NOTE: scraped from a Hugging Face Space (status: Runtime error),
# commit 0a06de9, file size 5,526 bytes. The page chrome and the
# line-number gutter from the viewer have been removed; everything
# below is the original file content (all of it commented out).
# """ from https://github.com/keithito/tacotron """
# from text import cleaners
# #from text.symbols import symbols
# class TextMapper(object):
# def __init__(self, vocab_file):
# self.symbols = [x.replace("\n", "") for x in open(vocab_file, encoding="utf-8").readlines()]
# self.SPACE_ID = self.symbols.index(" ")
# self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
# self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}
# def text_to_sequence(self, text, cleaner_names):
# '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
# Args:
# text: string to convert to a sequence
# cleaner_names: names of the cleaner functions to run the text through
# Returns:
# List of integers corresponding to the symbols in the text
# '''
# sequence = []
# clean_text = text.strip()
# for symbol in clean_text:
# symbol_id = self._symbol_to_id[symbol]
# sequence += [symbol_id]
# return sequence
# def uromanize(self, text, uroman_pl):
# iso = "xxx"
# with tempfile.NamedTemporaryFile() as tf, \
# tempfile.NamedTemporaryFile() as tf2:
# with open(tf.name, "w") as f:
# f.write("\n".join([text]))
# cmd = f"perl " + uroman_pl
# cmd += f" -l {iso} "
# cmd += f" < {tf.name} > {tf2.name}"
# os.system(cmd)
# outtexts = []
# with open(tf2.name) as f:
# for line in f:
# line = re.sub(r"\s+", " ", line).strip()
# outtexts.append(line)
# outtext = outtexts[0]
# return outtext
# def get_text(self, text, hps):
# text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
# if hps.data.add_blank:
# text_norm = commons.intersperse(text_norm, 0)
# text_norm = torch.LongTensor(text_norm)
# return text_norm
# def filter_oov(self, text):
# val_chars = self._symbol_to_id
# txt_filt = "".join(list(filter(lambda x: x in val_chars, text)))
# #print(f"text after filtering OOV: {txt_filt}")
# return txt_filt
# def preprocess_text(txt, text_mapper, hps, uroman_dir=None, lang=None):
# txt = preprocess_char(txt, lang=lang)
# is_uroman = hps.data.training_files.split('.')[-1] == 'uroman'
# if is_uroman:
# with tempfile.TemporaryDirectory() as tmp_dir:
# if uroman_dir is None:
# cmd = f"git clone [email protected]:isi-nlp/uroman.git {tmp_dir}"
# print(cmd)
# subprocess.check_output(cmd, shell=True)
# uroman_dir = tmp_dir
# uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
# print(f"uromanize")
# txt = text_mapper.uromanize(txt, uroman_pl)
# print(f"uroman text: {txt}")
# txt = txt.lower()
# txt = text_mapper.filter_oov(txt)
# return txt
# # Mappings from symbol to numeric ID and vice versa:
# mapper = TextMapper("ach/vocab.txt")
# _symbol_to_id = mapper._symbol_to_id#{s: i for i, s in enumerate(symbols)}
# _id_to_symbol = mapper._id_to_symbol#{i: s for i, s in enumerate(symbols)}
# def preprocess_text(txt, text_mapper, hps, uroman_dir=None, lang=None):
# txt = preprocess_char(txt, lang=lang)
# is_uroman = hps.data.training_files.split('.')[-1] == 'uroman'
# if is_uroman:
# with tempfile.TemporaryDirectory() as tmp_dir:
# if uroman_dir is None:
# cmd = f"git clone [email protected]:isi-nlp/uroman.git {tmp_dir}"
# print(cmd)
# subprocess.check_output(cmd, shell=True)
# uroman_dir = tmp_dir
# uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
# print(f"uromanize")
# txt = text_mapper.uromanize(txt, uroman_pl)
# print(f"uroman text: {txt}")
# txt = txt.lower()
# txt = text_mapper.filter_oov(txt)
# return txt
# def text_to_sequence(text, cleaner_names):
# '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
# Args:
# text: string to convert to a sequence
# cleaner_names: names of the cleaner functions to run the text through
# Returns:
# List of integers corresponding to the symbols in the text
# '''
# sequence = []
# clean_text = _clean_text(text, cleaner_names)
# for symbol in clean_text:
# symbol_id = _symbol_to_id[symbol]
# sequence += [symbol_id]
# return sequence
# def cleaned_text_to_sequence(cleaned_text):
# '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
# Args:
# text: string to convert to a sequence
# Returns:
# List of integers corresponding to the symbols in the text
# '''
# sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
# return sequence
# def sequence_to_text(sequence):
# '''Converts a sequence of IDs back to a string'''
# result = ''
# for symbol_id in sequence:
# s = _id_to_symbol[symbol_id]
# result += s
# return result
# def _clean_text(text, cleaner_names):
# for name in cleaner_names:
# cleaner = getattr(cleaners, name)
# if not cleaner:
# raise Exception('Unknown cleaner: %s' % name)
# text = cleaner(text)
# return text