Spaces:
Sleeping
Sleeping
import re | |
from num2words import num2words | |
# Regex character class for the characters treated as "end of a token":
# whitespace plus , . ? ! / ) ' ] >
punctuation = r'[\s,.?!/)\'\]>]'

# Spoken form of every capital letter. Each value is padded with a space on
# both sides so that consecutive letters stay separated once substituted in;
# the surplus whitespace is compacted later by preprocess().
alphabet_map = {
    "A": " Ei ",
    "B": " Bee ",
    "C": " See ",
    "D": " Dee ",
    "E": " Eee ",
    "F": " Eff ",
    "G": " Jee ",
    "H": " Eich ",
    "I": " Eye ",
    "J": " Jay ",
    "K": " Kay ",
    "L": " El ",
    "M": " Emm ",
    "N": " Enn ",
    "O": " Ohh ",
    "P": " Pee ",
    "Q": " Queue ",
    "R": " Are ",
    "S": " Ess ",
    "T": " Tee ",
    "U": " You ",
    "V": " Vee ",
    "W": " Double You ",
    "X": " Ex ",
    "Y": " Why ",
    # Zed is weird, as I (da3dsoul) am American, but most of the voice models
    # sound British, so it matches
    "Z": " Zed ",
}
def preprocess(string):
    """Run the full text clean-up pipeline and return the normalised string.

    The steps are applied in a fixed order, and the order matters: for
    example, comma separators have to be stripped from numbers before the
    numbers are expanded into words.

    :param string: raw input text
    :return: the processed text, stripped and with every run of whitespace
        collapsed to a single space
    """
    text = remove_surrounded_chars(string)

    # Drop double quotes: straight, right/left curly, and the italic-looking
    # reversed quote.
    for quote in ('"', '\u201D', '\u201C', '\u201F'):
        text = text.replace(quote, '')
    text = text.replace('\n', ' ')

    # TODO Try to use a ML predictor to expand abbreviations. It's hard,
    # dependent on context, and whether to actually try to say the
    # abbreviation or spell it out as done here is not agreed upon.
    # For now, abbreviations are expanded to pronunciations.
    # replace_abbreviations adds a lot of unnecessary whitespace to ensure
    # separation; it is cleaned up below.
    pipeline = (
        convert_num_locale,
        replace_negative,
        replace_roman,
        hyphen_range_to,
        num_to_words,
        replace_abbreviations,
        replace_lowercase_abbreviations,
    )
    for step in pipeline:
        text = step(text)

    # Whitespace clean-up: first remove whitespace sitting in front of
    # punctuation, then trim the ends and compact what is left.
    text = re.sub(rf'\s+({punctuation})', r'\1', text)
    return ' '.join(text.strip().split())
def remove_surrounded_chars(string):
    """Reduce image markup to its alt text and strip asterisk-delimited spans.

    If the input contains ``alt=`` followed later by ``style=``, only the
    text between the two (which would be the alt text of an embedded image)
    is kept for further processing.

    Afterwards every span that starts at an asterisk and runs to the next
    asterisk, or to the end of the string when there is no closing asterisk,
    is removed.

    :param string: text to clean
    :return: the cleaned text
    """
    alt_text = re.search(r'(?<=alt=)(.*)(?=style=)', string, re.DOTALL)
    if alt_text is not None:
        string = alt_text.group(0)
    return re.sub(r'\*[^*]*?(\*|$)', '', string)
def convert_num_locale(text):
    """Rewrite numbers so they use a dot as decimal mark and no separators.

    Numbers written as ``1.234,56`` (dot-grouped thousands with a comma
    decimal part, delimited by whitespace or the string boundaries) are
    turned into ``1234.56``. After that, any comma standing directly between
    two digits is dropped, which removes the thousands separators from
    numbers such as ``1,000,000``.

    :param text: text possibly containing formatted numbers
    :return: text with the numbers normalised
    """
    dotted_thousands = re.compile(r'(?:\s|^)\d{1,3}(?:\.\d{3})+(,\d+)(?:\s|$)')
    converted = text

    # Search again from the start after every rewrite: a match swallows the
    # whitespace after the number, which may also be the whitespace the next
    # number needs in front of it.
    found = dotted_thousands.search(converted)
    while found is not None:
        before = converted[:found.start()]
        number = found.group(0).replace('.', '').replace(',', '.')
        after = converted[found.end():]
        converted = before + number + after
        found = dotted_thousands.search(converted)

    # removes comma separators from existing American numbers
    digit_comma_digit = re.compile(r'(\d),(\d)')
    return digit_comma_digit.sub(r'\1\2', converted)
def replace_negative(string):
    """Replace a leading minus sign on an integer with the word "negative".

    Handles situations like ``-5``: it becomes ``negative 5``, which a later
    step can expand to ``negative five``.

    The minus sign must be at the start of the string or directly after
    whitespace, and the digits must be followed by a ``punctuation``
    character or the end of the string. Both boundaries are checked with
    lookaround assertions so they are not consumed by the match. That lets
    numbers at either end of the string be converted, and lets two negatives
    separated by a single space ("-5 -6") both be converted.

    :param string: text to process
    :return: text with negative numbers spelled with "negative"
    """
    return re.sub(rf'(?<!\S)-(\d+)(?={punctuation}|$)', r'negative \1', string)
# Roman numerals in canonical subtractive notation (1 to 3999). Used to tell
# real numerals apart from capital-letter runs that merely use the same
# letters, e.g. "ID" or "LCD".
_VALID_ROMAN = re.compile(
    r'M{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})'
)


def replace_roman(string):
    """Replace upper-case Roman numerals with their Arabic value.

    Only runs of 2 or more numeral letters are considered, to avoid capturing
    "I" and single-character abbreviations, like names. A run must start at
    the beginning of the string or after whitespace, and must be followed by
    a ``punctuation`` character or the end of the string. The boundaries are
    lookaround assertions, so neighbouring numerals that share one separating
    space are all found in a single pass.

    Runs that are not well-formed numerals (for example "ID" or "LCD") are
    left untouched instead of being turned into a meaningless number.

    :param string: text to process
    :return: text with Roman numerals replaced by decimal digits
    """
    pattern = re.compile(rf'(?<!\S)[IVXLCDM]{{2,}}(?={punctuation}|$)')

    def to_arabic(match):
        numeral = match.group()
        if _VALID_ROMAN.fullmatch(numeral) is None:
            return numeral
        return str(roman_to_int(numeral))

    return pattern.sub(to_arabic, string)
def roman_to_int(s):
    """Convert a string of upper-case Roman numeral letters to an integer.

    Symbols are added left to right. When a symbol is larger than the one
    before it, the earlier symbol was meant to be subtracted; since it has
    already been added once, it is taken off twice.

    :param s: string made up of the letters I, V, X, L, C, D and M
    :return: the integer value (0 for an empty string)
    :raises KeyError: if ``s`` contains any other character
    """
    symbol_values = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    total = 0
    previous = None
    for symbol in s:
        current = symbol_values[symbol]
        if previous is not None and current > previous:
            total += current - 2 * previous
        else:
            total += current
        previous = current
    return total
def hyphen_range_to(text):
    """Turn numeric ranges such as ``10-20`` into ``10 to 20``.

    Both the ASCII hyphen and the en dash are recognised as the separator
    between the two runs of digits.

    :param text: text to process
    :return: text with digit-hyphen-digit sequences rewritten using " to "
    """
    return re.sub(r'(\d+)[-–](\d+)', r'\1 to \2', text)
def num_to_words(text):
    """Spell out every number in ``text`` using ``num2words``.

    Matches either a decimal number (``10.23``) or a plain run of digits
    (``1000``).

    Plain digit runs are converted with ``int`` rather than ``float``. A
    float cannot represent every integer above 2**53, so routing a long digit
    string through ``float`` would silently spell out a different number.
    Decimal numbers are still passed to ``num2words`` as floats.

    :param text: text to process
    :return: text with the numbers replaced by words
    """
    # 1000 or 10.23
    pattern = re.compile(r'\d+\.\d+|\d+')

    def spell_out(match):
        digits = match.group()
        number = float(digits) if '.' in digits else int(digits)
        return num2words(number)

    return pattern.sub(spell_out, text)
def replace_abbreviations(string):
    """Spell out upper-case abbreviations letter by letter.

    An abbreviation is 1 to 4 capital letters that start at the beginning of
    the string or after whitespace or one of ``( . ' [ <``, and that are
    followed by a ``punctuation`` character or the end of the string. This
    also gets things like "A" and "I", but those are pronounced with their
    letter.

    :param string: text to process
    :return: text with each such abbreviation expanded via
        ``replace_abbreviation``
    """
    pattern = re.compile(rf'(^|[\s(.\'\[<])([A-Z]{{1,4}})({punctuation}|$)')
    expanded = string

    # Search again from the start after each expansion: the delimiter that
    # closes one match may be the one that opens the next.
    found = pattern.search(expanded)
    while found is not None:
        before = expanded[:found.start()]
        after = expanded[found.end():]
        expanded = before + replace_abbreviation(found.group(0)) + after
        found = pattern.search(expanded)
    return expanded
def replace_lowercase_abbreviations(string):
    """Spell out dotted lower-case abbreviations such as "i.e." or "e.g.".

    An abbreviation is 1 to 4 repetitions of "lower-case letter followed by
    a dot". It must start at the beginning of the string or after whitespace
    or one of ``( . ' [ <``, and must be followed by a ``punctuation``
    character or the end of the string. The matched text is upper-cased
    before being handed to ``replace_abbreviation``.

    :param string: text to process
    :return: text with each such abbreviation expanded
    """
    pattern = re.compile(rf'(^|[\s(.\'\[<])(([a-z]\.){{1,4}})({punctuation}|$)')
    expanded = string

    # Search again from the start after each expansion: the delimiter that
    # closes one match may be the one that opens the next.
    found = pattern.search(expanded)
    while found is not None:
        before = expanded[:found.start()]
        after = expanded[found.end():]
        expanded = before + replace_abbreviation(found.group(0).upper()) + after
        found = pattern.search(expanded)
    return expanded
def replace_abbreviation(string):
    """Return ``string`` with every character passed through ``match_mapping``.

    :param string: text whose characters should be mapped one by one
    :return: the concatenation of the mapped characters
    """
    return "".join(match_mapping(char) for char in string)
def match_mapping(char):
    """Return the ``alphabet_map`` entry for ``char``, or ``char`` itself.

    :param char: a single character
    :return: the mapped pronunciation when ``char`` is a key of
        ``alphabet_map``, otherwise ``char`` unchanged
    """
    return alphabet_map.get(char, char)
def __main__(args):
    """Preprocess the first command-line argument and print the result.

    :param args: argv-style sequence; ``args[1]`` is the text to process
    """
    text = args[1]
    print(preprocess(text))


if __name__ == "__main__":
    # Executed as a script: pass the argument vector to __main__.
    import sys

    __main__(sys.argv)