File size: 339 Bytes
19c634e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import re

# Ordered [pattern, replacement] pairs; preprocess() hands each pair to re.sub.
mappings = [
    # "i" followed by U+0307 (combining dot above) is two separate code
    # points, e.g. what str.lower() yields for U+0130; collapse to plain "i".
    ["i\u0307", "i"],
]

def preprocess(text):
    """Lower-case ``text`` and apply every substitution in ``mappings``.

    The lower-cased string is wrapped in one leading and one trailing space
    while the substitutions run, and both are sliced off again before the
    result is returned.  Each ``mappings`` entry is passed to ``re.sub`` as
    (pattern, replacement), so the first element is treated as a regular
    expression.
    """
    # NOTE(review): the padding presumably lets patterns match against
    # surrounding whitespace at the string edges -- confirm with the author.
    padded = " " + text.lower() + " "

    for pattern, replacement in mappings:
        padded = re.sub(pattern, replacement, padded)

    # Drop the two padding characters added above.
    return padded[1:-1]