File size: 2,738 Bytes
ed1082e
55d834b
8c4bd28
 
ed1082e
8c4bd28
55d834b
8c4bd28
ed1082e
55d834b
ed1082e
55d834b
 
 
 
 
 
 
8c4bd28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19e6f19
8c4bd28
 
 
 
 
 
 
 
 
8d24ba9
8c4bd28
 
8d24ba9
8c4bd28
8d24ba9
8c4bd28
 
8d24ba9
8c4bd28
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from crh_transliterator.transliterator import transliterate
from tabulate import tabulate


def test_transliterator():
    """
    Run the transliterator over every case of the test dataset.

    Each case returned by ``_read_test_cases`` is indexed as ``case[1]`` for
    the original text handed to ``transliterate`` and ``case[0]`` for the
    ground truth. Both sides are lower-cased before comparison, so the check
    is case-insensitive.

    Raises:
        Exception: if at least one case does not match. The message holds the
            failure ratio and a table of (original, converted, ground truth).
    """
    cases = _read_test_cases()
    failed = []
    for case in cases:
        # Transliterate once and reuse the result for both the comparison
        # and the failure report.
        converted = transliterate(case[1]).lower()
        expected = case[0].lower()
        if converted != expected:
            failed.append((case[1].lower(), converted, expected))
    if failed:
        raise Exception(
            f"Failed {len(failed)}/{len(cases)} ({round((len(failed)/len(cases))*100,2)}%) cases.\n"
            + tabulate(failed, headers=["Original", "Converted", "Ground truth"])
        )


def test_letter_coverage():
    """
    Verify that the test dataset contains every letter of both alphabets.

    Column 0 of each case is scanned for the Latin letters and column 1 for
    the Cyrillic ones, both lower-cased. Multi-character entries such as
    "гъ" are looked up and stripped from the text before the single letters,
    so a single letter only counts when it occurs outside those digraphs.

    Raises:
        Exception: if any letter is absent; the message lists the missing
            letters (Latin first, then Cyrillic).
    """
    latin_alphabet = [
        "a", "â", "b", "c", "ç", "d", "e", "f",
        "g", "ğ", "h", "ı", "i", "j", "k", "l",
        "m", "n", "ñ", "o", "ö", "p", "q", "r",
        "s", "ş", "t", "u", "ü", "v", "y", "z",
    ]
    cyrillic_alphabet = [
        "а", "б", "в", "г", "гъ", "д", "е", "ё",
        "ж", "з", "и", "й", "к", "къ", "л", "м",
        "н", "нъ", "о", "п", "р", "с", "т", "у",
        "ф", "х", "ц", "ч", "дж", "ш", "щ",
        # "ъ",  -- disabled in the alphabet; presumably because it only
        # appears inside the digraphs above (TODO confirm)
        "ы", "ь", "э", "ю", "я",
    ]

    def collect_absent(alphabet, text):
        # Longest entries first; the sort is stable, so entries of equal
        # length keep the order in which the alphabet lists them.
        absent = []
        for symbol in sorted(alphabet, key=len, reverse=True):
            if symbol in text:
                # Strip what was found so shorter entries cannot match
                # inside an already counted longer one.
                text = text.replace(symbol, "")
            else:
                absent.append(symbol)
        return absent

    dataset = _read_test_cases()

    latin_text = " ".join(row[0] for row in dataset).lower()
    missing_letters = collect_absent(latin_alphabet, latin_text)

    cyrillic_text = " ".join(row[1] for row in dataset).lower()
    missing_letters += collect_absent(cyrillic_alphabet, cyrillic_text)

    if missing_letters:
        raise Exception(f"'{missing_letters}' not found in test dataset!")


def _read_test_cases():
    """
    Load the test cases from ``tests/rosetta.csv``.

    The path is relative to the current working directory. The file is read
    as UTF-8 and every non-blank line is split on ``|``.

    Returns:
        A list with one entry per non-blank line, each entry being the list
        of ``|``-separated fields of that line.
    """
    # Explicit encoding: the data is non-ASCII, so the result must not depend
    # on the platform's default locale encoding.
    with open("tests/rosetta.csv", encoding="utf-8") as file:
        text = file.read()

    # Skip blank lines (e.g. the one produced by a trailing newline) so that
    # callers indexing case[1] do not hit a single-field row.
    return [line.split("|") for line in text.split("\n") if line.strip()]