Commit 78e8beb ("norm")
Author: Vineel Pratap
Parent: d15da79

Files changed:
- app.py +32 -8
- normalization/README.txt +3 -0
- normalization/__init__.py +0 -0
- normalization/norm_config.py +276 -0
- normalization/punctuations.lst +188 -0
- normalization/text_norm.py +92 -0
- zeroshot.py +5 -4
app.py CHANGED

@@ -84,7 +84,7 @@ with gr.Blocks(css="style.css") as demo:
     with gr.Accordion("Logs", open=False):
         logs = gr.Textbox(show_label=False)
 
-    # hack
+    # hack
     reference = gr.Textbox(label="Reference Transcript", visible=False)
 
     btn.click(
@@ -97,7 +97,7 @@
             lmscore,
             wscore_usedefault,
             lmscore_usedefault,
-            reference
+            reference,
         ],
         outputs=[text, logs],
     )
@@ -106,9 +106,21 @@
     gr.Examples(
         examples=[
             # ["upload/english/english.mp3", "upload/english/c4_25k_sentences.txt"],
-            [
-
-
+            [
+                "upload/english/english.mp3",
+                "upload/english/c4_10k_sentences.txt",
+                " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import",
+            ],
+            [
+                "upload/english/english.mp3",
+                "upload/english/c4_5k_sentences.txt",
+                " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import",
+            ],
+            [
+                "upload/english/english.mp3",
+                "upload/english/cv8_top10k_words.txt",
+                " This is going to look at the code that we have in our configuration that we've already exported and compare it to our database, and we want to import",
+            ],
         ],
         inputs=[audio, words_file, reference],
         label="English",
@@ -116,9 +128,21 @@
     gr.Examples(
         examples=[
             # ["upload/english/english.mp3", "upload/english/c4_25k_sentences.txt"],
-            [
-
-
+            [
+                "upload/ligurian/ligurian_1.mp3",
+                "upload/ligurian/zenamt_10k_sentences.txt",
+                "I mæ colleghi m’an domandou d’aggiuttâli à fâ unna preuva co-o zeneise pe vedde s’o fonçioña.",
+            ],
+            [
+                "upload/ligurian/ligurian_2.mp3",
+                "upload/ligurian/zenamt_10k_sentences.txt",
+                "Staseia vaggo à çenâ con mæ moggê e doî amixi che de chì à quarche settemaña faian stramuo feua stato.",
+            ],
+            [
+                "upload/ligurian/ligurian_3.mp3",
+                "upload/ligurian/zenamt_5k_sentences.txt",
+                "Pe inandiâ o pesto ghe veu o baxaicò, i pigneu, l’euio, o formaggio, l’aggio e a sâ.",
+            ],
         ],
         inputs=[audio, words_file, reference],
        label="Ligurian",
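The hidden "reference" Textbox above is what lets each gr.Examples row carry a ground-truth transcript: when a row is clicked, Gradio populates every component listed in inputs, visible or not, and passes them all to the click handler. A minimal sketch of that wiring, not taken from this commit (file names are placeholders; point them at real files when running):

import gradio as gr

with gr.Blocks() as demo:
    audio = gr.Audio(type="filepath")
    words_file = gr.File()
    # Hidden component: never rendered, but still filled in by example
    # rows and forwarded to event handlers like any other input.
    reference = gr.Textbox(label="Reference Transcript", visible=False)
    gr.Examples(
        # One value per component in `inputs`, in the same order.
        examples=[["sample.mp3", "words.txt", "a known-good transcript"]],
        inputs=[audio, words_file, reference],
    )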
normalization/README.txt ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a6aa5ef11df920fccc933f0d0ff4dd982a2872e0e544ab7409507ad6f130b81
+size 118
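Note that these three lines are a Git LFS pointer (spec v1), not the README's actual text: the Space tracks README.txt with LFS, so the diff records only the object's sha256 hash and its size (118 bytes).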
normalization/__init__.py ADDED

File without changes (empty file)
normalization/norm_config.py ADDED

@@ -0,0 +1,276 @@
+import os
+import re
+
+
+colon = ":"
+comma = ","
+exclamation_mark = "!"
+period = re.escape(".")
+question_mark = re.escape("?")
+semicolon = ";"
+
+left_curly_bracket = "{"
+right_curly_bracket = "}"
+quotation_mark = '"'
+
+basic_punc = (
+    period
+    + question_mark
+    + comma
+    + colon
+    + exclamation_mark
+    + left_curly_bracket
+    + right_curly_bracket
+)
+
+# General punc unicode block (0x2000-0x206F)
+zero_width_space = r"\u200B"
+zero_width_nonjoiner = r"\u200C"
+left_to_right_mark = r"\u200E"
+right_to_left_mark = r"\u200F"
+left_to_right_embedding = r"\u202A"
+pop_directional_formatting = r"\u202C"
+
+# Here are some commonly ill-typed versions of apostrophe
+right_single_quotation_mark = r"\u2019"
+left_single_quotation_mark = r"\u2018"
+
+# Language specific definitions
+# Spanish
+inverted_exclamation_mark = r"\u00A1"
+inverted_question_mark = r"\u00BF"
+
+
+# Hindi
+hindi_danda = u"\u0964"
+
+# Egyptian Arabic
+# arabic_percent = r"\u066A"
+arabic_comma = r"\u060C"
+arabic_question_mark = r"\u061F"
+arabic_semicolon = r"\u061B"
+arabic_diacritics = r"\u064B-\u0652"
+
+
+arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"
+
+
+# Chinese
+full_stop = r"\u3002"
+full_comma = r"\uFF0C"
+full_exclamation_mark = r"\uFF01"
+full_question_mark = r"\uFF1F"
+full_semicolon = r"\uFF1B"
+full_colon = r"\uFF1A"
+full_parentheses = r"\uFF08\uFF09"
+quotation_mark_horizontal = r"\u300C-\u300F"
+quotation_mark_vertical = r"\uFF41-\uFF44"
+title_marks = r"\u3008-\u300B"
+wavy_low_line = r"\uFE4F"
+ellipsis = r"\u22EF"
+enumeration_comma = r"\u3001"
+hyphenation_point = r"\u2027"
+forward_slash = r"\uFF0F"
+wavy_dash = r"\uFF5E"
+box_drawings_light_horizontal = r"\u2500"
+fullwidth_low_line = r"\uFF3F"
+chinese_punc = (
+    full_stop
+    + full_comma
+    + full_exclamation_mark
+    + full_question_mark
+    + full_semicolon
+    + full_colon
+    + full_parentheses
+    + quotation_mark_horizontal
+    + quotation_mark_vertical
+    + title_marks
+    + wavy_low_line
+    + ellipsis
+    + enumeration_comma
+    + hyphenation_point
+    + forward_slash
+    + wavy_dash
+    + box_drawings_light_horizontal
+    + fullwidth_low_line
+)
+
+# Armenian
+armenian_apostrophe = r"\u055A"
+emphasis_mark = r"\u055B"
+exclamation_mark = r"\u055C"  # note: shadows the ASCII "!" above (basic_punc is already built)
+armenian_comma = r"\u055D"
+armenian_question_mark = r"\u055E"
+abbreviation_mark = r"\u055F"
+armenian_full_stop = r"\u0589"
+armenian_punc = (
+    armenian_apostrophe
+    + emphasis_mark
+    + exclamation_mark
+    + armenian_comma
+    + armenian_question_mark
+    + abbreviation_mark
+    + armenian_full_stop
+)
+
+lesser_than_symbol = r"<"
+greater_than_symbol = r">"
+
+lesser_than_sign = r"\u003c"
+greater_than_sign = r"\u003e"
+
+nbsp_written_form = r"&nbsp"
+
+# Quotation marks
+left_double_quotes = r"\u201c"
+right_double_quotes = r"\u201d"
+left_double_angle = r"\u00ab"
+right_double_angle = r"\u00bb"
+left_single_angle = r"\u2039"
+right_single_angle = r"\u203a"
+low_double_quotes = r"\u201e"
+low_single_quotes = r"\u201a"
+high_double_quotes = r"\u201f"
+high_single_quotes = r"\u201b"
+
+all_punct_quotes = (
+    left_double_quotes
+    + right_double_quotes
+    + left_double_angle
+    + right_double_angle
+    + left_single_angle
+    + right_single_angle
+    + low_double_quotes
+    + low_single_quotes
+    + high_double_quotes
+    + high_single_quotes
+    + right_single_quotation_mark
+    + left_single_quotation_mark
+)
+mapping_quotes = (
+    "["
+    + high_single_quotes
+    + right_single_quotation_mark
+    + left_single_quotation_mark
+    + "]"
+)
+
+
+# Digits
+
+english_digits = r"\u0030-\u0039"
+bengali_digits = r"\u09e6-\u09ef"
+khmer_digits = r"\u17e0-\u17e9"
+devanagari_digits = r"\u0966-\u096f"
+oriya_digits = r"\u0b66-\u0b6f"
+extended_arabic_indic_digits = r"\u06f0-\u06f9"
+kayah_li_digits = r"\ua900-\ua909"
+fullwidth_digits = r"\uff10-\uff19"
+malayam_digits = r"\u0d66-\u0d6f"
+myanmar_digits = r"\u1040-\u1049"
+roman_numeral = r"\u2170-\u2179"
+nominal_digit_shapes = r"\u206f"
+
+# Load punctuations from MMS-lab data
+with open(f"{os.path.dirname(__file__)}/punctuations.lst", "r") as punc_f:
+    punc_list = punc_f.readlines()
+
+punct_pattern = r""
+for punc in punc_list:
+    # the first character in the tab-separated line is the punc to be removed
+    punct_pattern += re.escape(punc.split("\t")[0])
+
+shared_digits = (
+    english_digits
+    + bengali_digits
+    + khmer_digits
+    + devanagari_digits
+    + oriya_digits
+    + extended_arabic_indic_digits
+    + kayah_li_digits
+    + fullwidth_digits
+    + malayam_digits
+    + myanmar_digits
+    + roman_numeral
+    + nominal_digit_shapes
+)
+
+shared_punc_list = (
+    basic_punc
+    + all_punct_quotes
+    + greater_than_sign
+    + lesser_than_sign
+    + inverted_question_mark
+    + full_stop
+    + semicolon
+    + armenian_punc
+    + inverted_exclamation_mark
+    + arabic_comma
+    + enumeration_comma
+    + hindi_danda
+    + quotation_mark
+    + arabic_semicolon
+    + arabic_question_mark
+    + chinese_punc
+    + punct_pattern
+
+)
+
+shared_mapping = {
+    lesser_than_symbol: "",
+    greater_than_symbol: "",
+    nbsp_written_form: "",
+    r"(\S+)" + mapping_quotes + r"(\S+)": r"\1'\2",
+}
+
+shared_deletion_list = (
+    left_to_right_mark
+    + zero_width_nonjoiner
+    + arabic_subscript_alef_and_inverted_damma
+    + zero_width_space
+    + arabic_diacritics
+    + pop_directional_formatting
+    + right_to_left_mark
+    + left_to_right_embedding
+)
+
+norm_config = {
+    "*": {
+        "lower_case": True,
+        "punc_set": shared_punc_list,
+        "del_set": shared_deletion_list,
+        "mapping": shared_mapping,
+        "digit_set": shared_digits,
+        "unicode_norm": "NFKC",
+        "rm_diacritics": False,
+    }
+}
+
+#=============== Mongolian ===============#
+
+norm_config["mon"] = norm_config["*"].copy()
+# add soft hyphen to punc list to match with fleurs
+norm_config["mon"]["del_set"] += r"\u00AD"
+
+norm_config["khk"] = norm_config["mon"].copy()
+
+#=============== Hebrew ===============#
+
+norm_config["heb"] = norm_config["*"].copy()
+# add "HEBREW POINT" symbols to match with fleurs
+norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"
+
+#=============== Thai ===============#
+
+norm_config["tha"] = norm_config["*"].copy()
+# add "Zero width joiner" symbols to match with fleurs
+norm_config["tha"]["punc_set"] += r"\u200D"
+
+#=============== Arabic ===============#
+norm_config["ara"] = norm_config["*"].copy()
+norm_config["ara"]["mapping"]["ٱ"] = "ا"  # note: .copy() is shallow, so "mapping" is the shared dict
+norm_config["arb"] = norm_config["ara"].copy()
+
+#=============== Javanese ===============#
+norm_config["jav"] = norm_config["*"].copy()
+norm_config["jav"]["rm_diacritics"] = True
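How the table resolves in practice: each language entry is a shallow copy of the shared "*" entry with a field or two overridden, and unknown ISO codes fall back to "*". A small usage sketch (assumes it runs from the repo root so the relative punctuations.lst path resolves):

from normalization.norm_config import norm_config

# Unknown ISO codes fall back to the shared "*" entry.
config = norm_config.get("xxx", norm_config["*"])
print(config["unicode_norm"])               # NFKC

# Javanese overrides a single field on its own copy...
print(norm_config["jav"]["rm_diacritics"])  # True
# ...while the base entry keeps its default.
print(norm_config["*"]["rm_diacritics"])    # False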
normalization/punctuations.lst ADDED

@@ -0,0 +1,188 @@
+	7355 INVALID UNICODE 0x81
+	5265 INVALID UNICODE 0x90
+	75 INVALID UNICODE 0x8
+	31 INVALID UNICODE 0x8d
+	3 INVALID UNICODE 0x94
+	2 INVALID UNICODE 0x8f
+	2 INVALID UNICODE 0x1a
+	1 INVALID UNICODE 0x9d
+	1 INVALID UNICODE 0x93
+	1 INVALID UNICODE 0x92
+	8647 INVALID UNICODE 0xe295
+	6650 INVALID UNICODE 0xf21d
+	6234 INVALID UNICODE 0xf62d
+	4815 INVALID UNICODE 0xf173
+	4789 INVALID UNICODE 0xe514
+	4409 INVALID UNICODE 0xe293
+	3881 INVALID UNICODE 0xf523
+	3788 INVALID UNICODE 0xe233
+	2448 INVALID UNICODE 0xf50f
+	2177 INVALID UNICODE 0xe232
+	1955 INVALID UNICODE 0xea7b
+	1926 INVALID UNICODE 0xf172
+	973 INVALID UNICODE 0xe290
+	972 INVALID UNICODE 0xf519
+	661 INVALID UNICODE 0xe292
+	591 INVALID UNICODE 0xe328
+	509 INVALID UNICODE 0xe2fa
+	458 INVALID UNICODE 0xe234
+	446 INVALID UNICODE 0xe043
+	419 INVALID UNICODE 0xe040
+	399 INVALID UNICODE 0xe2fb
+	387 INVALID UNICODE 0xe32b
+	381 INVALID UNICODE 0xe236
+	374 INVALID UNICODE 0xf511
+	314 INVALID UNICODE 0xe517
+	296 INVALID UNICODE 0xe2fe
+	293 INVALID UNICODE 0xe492
+	291 INVALID UNICODE 0xf52d
+	289 INVALID UNICODE 0xe2fc
+	195 INVALID UNICODE 0xf521
+	190 INVALID UNICODE 0xe516
+	182 INVALID UNICODE 0xe041
+	178 INVALID UNICODE 0xf529
+	113 INVALID UNICODE 0xe2f9
+	87 INVALID UNICODE 0xe2d9
+	78 INVALID UNICODE 0xe32a
+	76 INVALID UNICODE 0xe291
+	74 INVALID UNICODE 0xe296
+	66 INVALID UNICODE 0xe518
+	52 INVALID UNICODE 0xe32c
+	46 INVALID UNICODE 0xe2db
+	41 INVALID UNICODE 0xe231
+	34 INVALID UNICODE 0xf522
+	33 INVALID UNICODE 0xf518
+	32 INVALID UNICODE 0xf513
+	27 INVALID UNICODE 0xe32d
+	25 INVALID UNICODE 0xe32e
+	23 INVALID UNICODE 0xe06b
+	15 INVALID UNICODE 0xea01
+	12 INVALID UNICODE 0xe294
+	11 INVALID UNICODE 0xe203
+	8 INVALID UNICODE 0xf218
+	7 INVALID UNICODE 0xe070
+	7 INVALID UNICODE 0xe013
+	5 INVALID UNICODE 0xe2de
+	4 INVALID UNICODE 0xe493
+	3 INVALID UNICODE 0xf7e8
+	3 INVALID UNICODE 0xf7d0
+	3 INVALID UNICODE 0xe313
+	2 INVALID UNICODE 0xe329
+	2 INVALID UNICODE 0xe06d
+	2 INVALID UNICODE 0xe003
+	1 INVALID UNICODE 0xf50e
+	1 INVALID UNICODE 0xf171
+	1 INVALID UNICODE 0xe01d
+	71 NOMINAL DIGIT SHAPES 0x206f
+	3 WORD JOINER 0x2060
+―	126545 HORIZONTAL BAR 0x2015
+־	1028 HEBREW PUNCTUATION MAQAF 0x5be
+)	98429 RIGHT PARENTHESIS 0x29
+]	27108 RIGHT SQUARE BRACKET 0x5d
+⌋	1567 RIGHT FLOOR 0x230b
+〕	97 RIGHT TORTOISE SHELL BRACKET 0x3015
+】	36 RIGHT BLACK LENTICULAR BRACKET 0x3011
+﴾	14 ORNATE LEFT PARENTHESIS 0xfd3e
+&	170517 AMPERSAND 0x26
+།	106330 TIBETAN MARK SHAD 0xf0d
+።	90203 ETHIOPIC FULL STOP 0x1362
+፥	60484 ETHIOPIC COLON 0x1365
+༌	60464 TIBETAN MARK DELIMITER TSHEG BSTAR 0xf0c
+။	51567 MYANMAR SIGN SECTION 0x104b
+/	46929 SOLIDUS 0x2f
+၊	38042 MYANMAR SIGN LITTLE SECTION 0x104a
+·	37985 MIDDLE DOT 0xb7
+‸	36310 CARET 0x2038
+*	34793 ASTERISK 0x2a
+۔	32432 ARABIC FULL STOP 0x6d4
+፤	31906 ETHIOPIC SEMICOLON 0x1364
+၏	21519 MYANMAR SYMBOL GENITIVE 0x104f
+។	20834 KHMER SIGN KHAN 0x17d4
+꓾	15773 LISU PUNCTUATION COMMA 0xa4fe
+᙮	13473 CANADIAN SYLLABICS FULL STOP 0x166e
+꤯	12892 KAYAH LI SIGN SHYA 0xa92f
+⵰	11478 TIFINAGH SEPARATOR MARK 0x2d70
+꓿	11118 LISU PUNCTUATION FULL STOP 0xa4ff
+॥	10763 DEVANAGARI DOUBLE DANDA 0x965
+؞	10403 ARABIC TRIPLE DOT PUNCTUATION MARK 0x61e
+၍	8936 MYANMAR SYMBOL COMPLETED 0x104d
+·	8431 GREEK ANO TELEIA 0x387
+†	7477 DAGGER 0x2020
+၌	6632 MYANMAR SYMBOL LOCATIVE 0x104c
+፣	5719 ETHIOPIC COMMA 0x1363
+៖	5528 KHMER SIGN CAMNUC PII KUUH 0x17d6
+꤮	4791 KAYAH LI SIGN CWI 0xa92e
+※	3439 REFERENCE MARK 0x203b
+፦	2727 ETHIOPIC PREFACE COLON 0x1366
+•	1749 BULLET 0x2022
+¶	1507 PILCROW SIGN 0xb6
+၎	1386 MYANMAR SYMBOL AFOREMENTIONED 0x104e
+﹖	1224 SMALL QUESTION MARK 0xfe56
+;	975 GREEK QUESTION MARK 0x37e
+…	827 HORIZONTAL ELLIPSIS 0x2026
+%	617 PERCENT SIGN 0x25
+・	468 KATAKANA MIDDLE DOT 0x30fb
+༎	306 TIBETAN MARK NYIS SHAD 0xf0e
+‡	140 DOUBLE DAGGER 0x2021
+#	137 NUMBER SIGN 0x23
+@	125 COMMERCIAL AT 0x40
+፡	121 ETHIOPIC WORDSPACE 0x1361
+៚	55 KHMER SIGN KOOMUUT 0x17da
+៕	49 KHMER SIGN BARIYOOSAN 0x17d5
+﹐	10 SMALL COMMA 0xfe50
+༅	6 TIBETAN MARK CLOSING YIG MGO SGAB MA 0xf05
+༄	6 TIBETAN MARK INITIAL YIG MGO MDUN MA 0xf04
+.	2 FULLWIDTH FULL STOP 0xff0e
+﹗	2 SMALL EXCLAMATION MARK 0xfe57
+﹕	2 SMALL COLON 0xfe55
+‰	2 PER MILLE SIGN 0x2030
+・	1 HALFWIDTH KATAKANA MIDDLE DOT 0xff65
+(	98504 LEFT PARENTHESIS 0x28
+[	27245 LEFT SQUARE BRACKET 0x5b
+⌊	1567 LEFT FLOOR 0x230a
+〔	95 LEFT TORTOISE SHELL BRACKET 0x3014
+【	36 LEFT BLACK LENTICULAR BRACKET 0x3010
+﴿	14 ORNATE RIGHT PARENTHESIS 0xfd3f
+_	4851 LOW LINE 0x5f
+$	72 DOLLAR SIGN 0x24
+€	14 EURO SIGN 0x20ac
+£	2 POUND SIGN 0xa3
+~	27462 TILDE 0x7e
+=	11450 EQUALS SIGN 0x3d
+|	8430 VERTICAL LINE 0x7c
+−	3971 MINUS SIGN 0x2212
+≫	1904 MUCH GREATER-THAN 0x226b
+≪	1903 MUCH LESS-THAN 0x226a
++	1450 PLUS SIGN 0x2b
+<	345 FULLWIDTH LESS-THAN SIGN 0xff1c
+>	344 FULLWIDTH GREATER-THAN SIGN 0xff1e
+¬	5 NOT SIGN 0xac
+×	4 MULTIPLICATION SIGN 0xd7
+→	2 RIGHTWARDS ARROW 0x2192
+᙭	537 CANADIAN SYLLABICS CHI SIGN 0x166d
+°	499 DEGREE SIGN 0xb0
+႟	421 MYANMAR SYMBOL SHAN EXCLAMATION 0x109f
+�	192 REPLACEMENT CHARACTER 0xfffd
+⌟	54 BOTTOM RIGHT CORNER 0x231f
+⌞	54 BOTTOM LEFT CORNER 0x231e
+©	2 COPYRIGHT SIGN 0xa9
+ 	40 NARROW NO-BREAK SPACE 0x202f
+ 	1 SIX-PER-EM SPACE 0x2006
+˜	40261 SMALL TILDE 0x2dc
+^	6469 CIRCUMFLEX ACCENT 0x5e
+¯	20 MACRON 0xaf
+ˇ	191442 CARON 0x2c7
+ⁿ	38144 SUPERSCRIPT LATIN SMALL LETTER N 0x207f
+ـ	9440 ARABIC TATWEEL 0x640
+ๆ	6766 THAI CHARACTER MAIYAMOK 0xe46
+ៗ	3310 KHMER SIGN LEK TOO 0x17d7
+々	678 IDEOGRAPHIC ITERATION MARK 0x3005
+ໆ	430 LAO KO LA 0xec6
+ー	319 KATAKANA-HIRAGANA PROLONGED SOUND MARK 0x30fc
+ⁱ	137 SUPERSCRIPT LATIN SMALL LETTER I 0x2071
+৷	11056 BENGALI CURRENCY NUMERATOR FOUR 0x9f7
+⅓	26 VULGAR FRACTION ONE THIRD 0x2153
+½	26 VULGAR FRACTION ONE HALF 0xbd
+¼	4 VULGAR FRACTION ONE QUARTER 0xbc
+⅟	1 FRACTION NUMERATOR ONE 0x215f
+⁄	57 FRACTION SLASH 0x2044
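Each line of punctuations.lst holds the character itself, then what appears to be an occurrence count, the Unicode name, and the code point; the character is separated from the rest by a tab, and norm_config.py consumes only that first field. A sketch of the parse (the sample line is copied from the list above):

import re

line = "―\t126545 HORIZONTAL BAR 0x2015\n"

# Only the character before the first tab enters the regex class;
# the count, name, and code point columns are documentation.
char = line.split("\t")[0]
fragment = re.escape(char)  # "―" (re.escape leaves non-special characters alone)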
normalization/text_norm.py ADDED

@@ -0,0 +1,92 @@
+import json
+import re
+import unicodedata
+
+from normalization.norm_config import norm_config
+
+
+def text_normalize(text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False):
+
+    """Given a text, normalize it by lower-casing, removing punctuation, removing words that contain only digits, and removing extra spaces
+
+    Args:
+        text : The string to be normalized
+        iso_code : ISO code of the language, used to pick a language-specific config (falls back to "*")
+        remove_numbers : Boolean flag to specify if words containing only digits should be removed
+
+    Returns:
+        normalized_text : the string after all normalization
+
+    """
+
+    config = norm_config.get(iso_code, norm_config["*"])
+
+    for field in ["lower_case", "punc_set", "del_set", "mapping", "digit_set", "unicode_norm"]:
+        if field not in config:
+            config[field] = norm_config["*"][field]
+
+
+    text = unicodedata.normalize(config["unicode_norm"], text)
+
+    # Convert to lower case
+
+    if config["lower_case"] and lower_case:
+        text = text.lower()
+
+    # Brackets
+
+    # Always remove text inside brackets containing digits; it usually corresponds to references like "(Sam 23:17)"
+    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
+    if remove_brackets:
+        text = re.sub(r"\([^\)]*\)", " ", text)
+
+    # Apply mappings
+
+    for old, new in config["mapping"].items():
+        text = re.sub(old, new, text)
+
+    # Replace punctuation with space
+
+    punct_pattern = r"[" + config["punc_set"]
+
+    punct_pattern += "]"
+
+    normalized_text = re.sub(punct_pattern, " ", text)
+
+    # Remove characters in the delete list
+
+    delete_pattern = r"[" + config["del_set"] + "]"
+
+    normalized_text = re.sub(delete_pattern, "", normalized_text)
+
+    # Remove words containing only digits
+    # We check for 3 cases: a) the text starts with a number, b) a number is present somewhere in the middle, c) the text ends with a number
+    # For each case we use a lookaround pattern to check that the digit run is preceded and followed by whitespace; only then is it replaced with a space
+    # The lookarounds enable overlapping pattern matches to be replaced
+
+    if remove_numbers:
+
+        digits_pattern = "[" + config["digit_set"]
+
+        digits_pattern += "]+"
+
+        complete_digit_pattern = (
+            r"^"
+            + digits_pattern
+            + r"(?=\s)|(?<=\s)"
+            + digits_pattern
+            + r"(?=\s)|(?<=\s)"
+            + digits_pattern
+            + r"$"
+        )
+
+        normalized_text = re.sub(complete_digit_pattern, " ", normalized_text)
+
+    if config["rm_diacritics"]:
+        from unidecode import unidecode
+        normalized_text = unidecode(normalized_text)
+
+    # Remove extra spaces
+    normalized_text = re.sub(r"\s+", " ", normalized_text).strip()
+
+    return normalized_text
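A quick sanity check of text_normalize, assuming the repo layout above ("xxx" is not in norm_config, so it falls back to the shared "*" entry, exactly as in the load_words() change below):

from normalization.text_norm import text_normalize

print(text_normalize("Hello, World!! (Sam 23:17)", iso_code="xxx"))
# -> "hello world": lower-cased, the digit-bearing bracket dropped,
#    punctuation replaced by spaces, and whitespace collapsed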
zeroshot.py CHANGED

@@ -9,6 +9,7 @@ import numpy as np
 from transformers import Wav2Vec2ForCTC, AutoProcessor
 from huggingface_hub import hf_hub_download
 from torchaudio.models.decoder import ctc_decoder
+from normalization.text_norm import text_normalize
 
 uroman_dir = "uroman"
 assert os.path.exists(uroman_dir)
@@ -94,6 +95,7 @@ def load_words(filepath):
     with open(filepath) as f:
         for line in f:
             line = line.strip().lower()
+            line = text_normalize(line, iso_code="xxx")
             # ignore invalid words.
             for w in line.split():
                 words.setdefault(w, 0)
@@ -109,7 +111,7 @@
     lmscore=None,
     wscore_usedefault=True,
    lmscore_usedefault=True,
-    reference=None
+    reference=None,
 ):
     transcription, logs = "", MY_LOG()
     if not audio_data or not words_file:
@@ -169,7 +171,6 @@
 
     yield transcription, logs.add(f"Lexicon size: {len(lexicon)}")
 
-
     if lm_path is None:
         yield transcription, logs.add(f"Filtering lexicon....")
         lexicon = filter_lexicon(lexicon, word_counts)
@@ -219,8 +220,8 @@
     yield transcription, logs.add(f"[DONE]")
 
 
-
-
+for i in process("upload/english/english.mp3", "upload/english/c4_5k_sentences.txt"):
+    print(i)
 
 
 # for i in process("upload/ligurian/ligurian_1.mp3", "upload/ligurian/zenamt_5k_sentences.txt"):
|