# """ from https://github.com/keithito/tacotron """
# from text import cleaners
# #from text.symbols import symbols


class TextMapper(object):
    def __init__(self, vocab_file):
        # One symbol per line; the vocab must include the space character.
        with open(vocab_file, encoding="utf-8") as f:
            self.symbols = [line.rstrip("\n") for line in f]
        self.SPACE_ID = self.symbols.index(" ")
        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text, cleaner_names):
        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
        Args:
            text: string to convert to a sequence
            cleaner_names: accepted for API compatibility with the original
                Tacotron interface; cleaning is expected to happen upstream
                (see preprocess_text), so this argument is not used here
        Returns:
            List of integers corresponding to the symbols in the text
        '''
        sequence = []
        clean_text = text.strip()
        for symbol in clean_text:
            symbol_id = self._symbol_to_id[symbol]
            sequence.append(symbol_id)
        return sequence

    def uromanize(self, text, uroman_pl):
        # Placeholder language code; uroman's romanization is largely
        # language-independent.
        iso = "xxx"
        with tempfile.NamedTemporaryFile() as tf, \
             tempfile.NamedTemporaryFile() as tf2:
            with open(tf.name, "w") as f:
                f.write("\n".join([text]))
            cmd = f"perl {uroman_pl} -l {iso} < {tf.name} > {tf2.name}"
            os.system(cmd)
            outtexts = []
            with open(tf2.name) as f:
                for line in f:
                    line = re.sub(r"\s+", " ", line).strip()
                    outtexts.append(line)
            outtext = outtexts[0]
        return outtext

    def get_text(self, text, hps):
        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
        if hps.data.add_blank:
            # Interleave a blank token (ID 0) between symbols, as VITS expects
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def filter_oov(self, text):
        # Drop any character that is not in the vocabulary
        val_chars = self._symbol_to_id
        txt_filt = "".join(c for c in text if c in val_chars)
        return txt_filt
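

# `preprocess_char` is called by `preprocess_text` below but was never defined
# in this file. The minimal sketch here is an assumption, modeled on the
# language-specific character handling in Meta's MMS-TTS inference example,
# where Romanian is the only special case; other languages pass through.
def preprocess_char(text, lang=None):
    """Language-specific character normalization (assumed helper)."""
    if lang == "ron":
        # MMS Romanian vocabularies use the cedilla form of t
        text = text.replace("ț", "ţ")
    return text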

def preprocess_text(txt, text_mapper, hps, uroman_dir=None, lang=None):
    txt = preprocess_char(txt, lang=lang)
    is_uroman = hps.data.training_files.split('.')[-1] == 'uroman'
    if is_uroman:
        with tempfile.TemporaryDirectory() as tmp_dir:
            if uroman_dir is None:
                # Note: the SSH URL requires a configured GitHub key; the
                # HTTPS URL https://github.com/isi-nlp/uroman.git works
                # anonymously.
                cmd = f"git clone [email protected]:isi-nlp/uroman.git {tmp_dir}"
                print(cmd)
                subprocess.check_output(cmd, shell=True)
                uroman_dir = tmp_dir
            uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
            print("uromanize")
            txt = text_mapper.uromanize(txt, uroman_pl)
            print(f"uroman text: {txt}")
    txt = txt.lower()
    txt = text_mapper.filter_oov(txt)
    return txt


# Mappings from symbol to numeric ID and vice versa, built from the vocab file:
mapper = TextMapper("ach/vocab.txt")
_symbol_to_id = mapper._symbol_to_id
_id_to_symbol = mapper._id_to_symbol


def text_to_sequence(text, cleaner_names):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
    Returns:
        List of integers corresponding to the symbols in the text
    '''
    sequence = []
    clean_text = _clean_text(text, cleaner_names)
    for symbol in clean_text:
        symbol_id = _symbol_to_id[symbol]
        sequence.append(symbol_id)
    return sequence


def cleaned_text_to_sequence(cleaned_text):
    '''Converts a string of already-cleaned text to a sequence of symbol IDs.
    Args:
        cleaned_text: string to convert to a sequence
    Returns:
        List of integers corresponding to the symbols in the text
    '''
    sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
    return sequence


def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string'''
    return ''.join(_id_to_symbol[symbol_id] for symbol_id in sequence)


def _clean_text(text, cleaner_names):
    for name in cleaner_names:
        # Default to None so an unknown cleaner raises the intended Exception
        # instead of an AttributeError from getattr itself
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
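

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original file. It assumes
    # "ach/vocab.txt" exists (the module-level `mapper` above already requires
    # it) and fakes the hyperparameter object; the field values here are
    # illustrative, not the project's real config.
    from types import SimpleNamespace

    hps = SimpleNamespace(
        data=SimpleNamespace(
            training_files="train_filelist.txt",  # hypothetical; no ".uroman" suffix, so uromanization is skipped
            add_blank=True,
            text_cleaners=[],
        )
    )
    txt = preprocess_text("Hello world", mapper, hps, lang="ach")
    ids = mapper.get_text(txt, hps)        # LongTensor with blank IDs interspersed
    print(ids)
    print(sequence_to_text(ids.tolist()))  # round-trip back to (blank-padded) text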