# """ from https://github.com/keithito/tacotron """
# from text import cleaners
# #from text.symbols import symbols


class TextMapper(object):
    def __init__(self, vocab_file):
        # One symbol per line; the vocab must include the space character.
        with open(vocab_file, encoding="utf-8") as f:
            self.symbols = [line.rstrip("\n") for line in f]
        self.SPACE_ID = self.symbols.index(" ")
        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text, cleaner_names):
        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
        Args:
            text: string to convert to a sequence
            cleaner_names: accepted for API compatibility with the original
                Tacotron interface; cleaning is expected to happen upstream
                (see preprocess_text), so this argument is not used here
        Returns:
            List of integers corresponding to the symbols in the text
        '''
        sequence = []
        clean_text = text.strip()
        for symbol in clean_text:
            symbol_id = self._symbol_to_id[symbol]
            sequence.append(symbol_id)
        return sequence

    def uromanize(self, text, uroman_pl):
        # Placeholder language code; uroman's romanization is largely
        # language-independent.
        iso = "xxx"
        with tempfile.NamedTemporaryFile() as tf, \
             tempfile.NamedTemporaryFile() as tf2:
            with open(tf.name, "w") as f:
                f.write("\n".join([text]))
            cmd = f"perl {uroman_pl} -l {iso} < {tf.name} > {tf2.name}"
            os.system(cmd)
            outtexts = []
            with open(tf2.name) as f:
                for line in f:
                    line = re.sub(r"\s+", " ", line).strip()
                    outtexts.append(line)
            outtext = outtexts[0]
        return outtext

    def get_text(self, text, hps):
        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
        if hps.data.add_blank:
            # Interleave a blank token (ID 0) between symbols, as VITS expects
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def filter_oov(self, text):
        # Drop any character that is not in the vocabulary
        val_chars = self._symbol_to_id
        txt_filt = "".join(c for c in text if c in val_chars)
        return txt_filt
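

# `preprocess_char` is called by `preprocess_text` below but was never defined
# in this file. The minimal sketch here is an assumption, modeled on the
# language-specific character handling in Meta's MMS-TTS inference example,
# where Romanian is the only special case; other languages pass through.
def preprocess_char(text, lang=None):
    """Language-specific character normalization (assumed helper)."""
    if lang == "ron":
        # MMS Romanian vocabularies use the cedilla form of t
        text = text.replace("ț", "ţ")
    return text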

def preprocess_text(txt, text_mapper, hps, uroman_dir=None, lang=None):
    txt = preprocess_char(txt, lang=lang)
    is_uroman = hps.data.training_files.split('.')[-1] == 'uroman'
    if is_uroman:
        with tempfile.TemporaryDirectory() as tmp_dir:
            if uroman_dir is None:
                # Note: the SSH URL requires a configured GitHub key; the
                # HTTPS URL https://github.com/isi-nlp/uroman.git works
                # anonymously.
                cmd = f"git clone [email protected]:isi-nlp/uroman.git {tmp_dir}"
                print(cmd)
                subprocess.check_output(cmd, shell=True)
                uroman_dir = tmp_dir
            uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
            print("uromanize")
            txt = text_mapper.uromanize(txt, uroman_pl)
            print(f"uroman text: {txt}")
    txt = txt.lower()
    txt = text_mapper.filter_oov(txt)
    return txt


# Mappings from symbol to numeric ID and vice versa, built from the vocab file:
mapper = TextMapper("ach/vocab.txt")
_symbol_to_id = mapper._symbol_to_id
_id_to_symbol = mapper._id_to_symbol


def text_to_sequence(text, cleaner_names):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
    Returns:
        List of integers corresponding to the symbols in the text
    '''
    sequence = []
    clean_text = _clean_text(text, cleaner_names)
    for symbol in clean_text:
        symbol_id = _symbol_to_id[symbol]
        sequence.append(symbol_id)
    return sequence


def cleaned_text_to_sequence(cleaned_text):
    '''Converts a string of already-cleaned text to a sequence of symbol IDs.
    Args:
        cleaned_text: string to convert to a sequence
    Returns:
        List of integers corresponding to the symbols in the text
    '''
    sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
    return sequence


def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string'''
    return ''.join(_id_to_symbol[symbol_id] for symbol_id in sequence)


def _clean_text(text, cleaner_names):
    for name in cleaner_names:
        # Default to None so an unknown cleaner raises the intended Exception
        # instead of an AttributeError from getattr itself
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
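

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original file. It assumes
    # "ach/vocab.txt" exists (the module-level `mapper` above already requires
    # it) and fakes the hyperparameter object; the field values here are
    # illustrative, not the project's real config.
    from types import SimpleNamespace

    hps = SimpleNamespace(
        data=SimpleNamespace(
            training_files="train_filelist.txt",  # hypothetical; no ".uroman" suffix, so uromanization is skipped
            add_blank=True,
            text_cleaners=[],
        )
    )
    txt = preprocess_text("Hello world", mapper, hps, lang="ach")
    ids = mapper.get_text(txt, hps)        # LongTensor with blank IDs interspersed
    print(ids)
    print(sequence_to_text(ids.tolist()))  # round-trip back to (blank-padded) text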