whisper-large-v2-pt-v3 / portuguese_normalizer.py
jlondonobo's picture
Training in progress, step 500
a75ef1e
raw
history blame
2.7 kB
# Modified from OpenAI's Whisper english_normalizer.
import re
import unicodedata
from typing import Iterable
# Non-ASCII letters that "NFKD" normalization leaves as a single character
# (it does not split them into a base letter plus a combining mark), mapped
# by hand to an ASCII stand-in. Entries are listed as lower/upper-case pairs.
ADDITIONAL_DIACRITICS = {
    "œ": "oe", "Œ": "OE",
    "ø": "o", "Ø": "O",
    "æ": "ae", "Æ": "AE",
    "ß": "ss", "ẞ": "SS",
    "đ": "d", "Đ": "D",
    "ð": "d", "Ð": "D",
    # NOTE(review): upper-case thorn maps to lower-case "th", unlike every
    # other upper-case key in this table -- confirm this is intended.
    "þ": "th", "Þ": "th",
    "ł": "l", "Ł": "L",
}
# Accented lower-case letters, grouped by the accent they carry. Each string
# is unpacked into its individual characters, so the list holds one
# single-character string per letter, in the order written here.
PORTUGUESE_ACCENTED_CHARACTERS = [
    "ç",  # cedilla
    *"áéíóú",  # acute
    *"âêô",  # circumflex
    *"ãõ",  # tilde
    *"àòèìù",  # grave
]
# Unicode combining marks, i.e. the accent half of a letter once it has been
# decomposed. Spelled as escapes because the raw characters have no glyph of
# their own and are easy to corrupt or misread in an editor.
PORTUGUESE_DIACRITICS = [
    "\u0327",  # COMBINING CEDILLA
    "\u0302",  # COMBINING CIRCUMFLEX ACCENT
    "\u0300",  # COMBINING GRAVE ACCENT
    "\u0303",  # COMBINING TILDE
    "\u0301",  # COMBINING ACUTE ACCENT
]
def remove_symbols_and_diacritics(s: str, keep: Iterable[str] = "") -> str:
    """
    Replace markers, symbols, and punctuation with a space, and drop diacritics.

    The input is first decomposed with "NFKD" normalization, so every rule
    below is applied to the decomposed characters (an accented letter is seen
    as its base letter followed by its combining marks). Each character is
    handled by the first rule that matches:

    1. a character listed in ``keep`` is copied through unchanged;
    2. a character with an entry in ``ADDITIONAL_DIACRITICS`` is replaced by
       that entry;
    3. a non-spacing mark (Unicode category ``Mn``) is dropped;
    4. any other mark, symbol, or punctuation character (category starting
       with ``M``, ``S`` or ``P``) becomes a single space;
    5. anything else is copied through unchanged.

    Args:
        s: Text to clean.
        keep: Single characters to preserve as-is. They are compared against
            the NFKD-decomposed text, so to preserve an accent pass its
            combining mark rather than the precomposed letter. Any iterable
            is accepted, including a one-shot iterator or generator.

    Returns:
        The cleaned text. It is not recomposed, so kept combining marks stay
        as separate characters after their base letter.
    """
    # Materialize ``keep`` once. Testing ``c in keep`` directly would consume
    # a one-shot iterator on the first few characters, after which every
    # later character would silently be treated as "not kept".
    kept = frozenset(keep)

    pieces = []
    for c in unicodedata.normalize("NFKD", s):
        if c in kept:
            pieces.append(c)
        elif c in ADDITIONAL_DIACRITICS:
            pieces.append(ADDITIONAL_DIACRITICS[c])
        else:
            category = unicodedata.category(c)
            if category == "Mn":
                # Non-spacing mark: drop it so only the base letter remains.
                continue
            pieces.append(" " if category[0] in "MSP" else c)
    return "".join(pieces)
class PortugueseTextNormalizer:
    """
    Callable that reduces Portuguese text to a lower-case, punctuation-free form.

    Instances carry two attributes:

    * ``ignore_patterns`` -- a regex of filler interjections that are deleted;
    * ``replacers`` -- a mapping from a regex matching an abbreviated title
      to the spelled-out replacement text.
    """

    def __init__(self):
        # Filler interjections, matched as whole words.
        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh)\b"

        # Abbreviated titles/prefixes and what they stand for.
        expansions = {
            "sr": "senhor",
            "sra": "senhora",
            "sto": "santo",
            "sta": "santa",
            "dr": "doutor",
            "dra": "doutora",
            "prof": "professor",
            "cap": "capitão",
        }
        # Every key is the abbreviation wrapped in word boundaries; every
        # value is the full form followed by one space.
        self.replacers = {
            rf"\b{abbreviation}\b": f"{full_form} "
            for abbreviation, full_form in expansions.items()
        }

    def __call__(self, s: str):
        """
        Return the normalized form of ``s``.

        Steps, in order: lower-case; delete text enclosed in ``<>``/``[]``
        and in parentheses; delete filler interjections; expand abbreviated
        titles; delete commas and periods that sit between two digits;
        replace symbols and punctuation with spaces and strip diacritics
        other than those in ``PORTUGUESE_DIACRITICS``; collapse whitespace
        runs to one space; lower-case again.

        Leading and trailing whitespace is collapsed but not stripped, and
        the kept diacritics remain in decomposed form.
        """
        text = s.lower()

        # Remove words between brackets, then words between parentheses.
        enclosed_spans = (r"[<\[][^>\]]*[>\]]", r"\(([^)]+?)\)")
        for enclosed in enclosed_spans:
            text = re.sub(enclosed, "", text)

        text = re.sub(self.ignore_patterns, "", text)

        for abbreviation, full_form in self.replacers.items():
            text = re.sub(abbreviation, full_form, text)

        # In English, one would remove commas between digits (thousands
        # separators) and periods not followed by digits. In Portuguese,
        # either a comma or a period can be the decimal separator, so both
        # are removed whenever they sit between two digits. Commas are
        # handled first, then periods; the two passes are kept separate
        # because merging them changes the result on mixed inputs.
        digit_separators = (r"(\d),(\d)", r"(\d)\.(\d)")
        for separator in digit_separators:
            text = re.sub(separator, r"\1\2", text)

        text = remove_symbols_and_diacritics(text, keep=PORTUGUESE_DIACRITICS)

        # Replace any run of whitespace with a single space. The closing
        # lower() is kept because NFKD decomposition can yield upper-case
        # letters from characters that lower() left untouched.
        return re.sub(r"\s+", " ", text).lower()