Spaces:
Running
Running
File size: 2,738 Bytes
ed1082e 55d834b 8c4bd28 ed1082e 8c4bd28 55d834b 8c4bd28 ed1082e 55d834b ed1082e 55d834b 8c4bd28 19e6f19 8c4bd28 8d24ba9 8c4bd28 8d24ba9 8c4bd28 8d24ba9 8c4bd28 8d24ba9 8c4bd28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
from crh_transliterator.transliterator import transliterate
from tabulate import tabulate
def test_transliterator():
    """
    Check transliterate() against every row of the test dataset.

    Each row returned by _read_test_cases() is read as (ground truth, original):
    column 1 is passed to transliterate() and the lower-cased result is compared
    with the lower-cased column 0.

    Raises:
        Exception: if at least one row does not match. The message contains the
            failure count and percentage followed by a table of the failing rows
            (original, converted, ground truth).
    """
    cases = _read_test_cases()
    failed = []
    for case in cases:
        # Transliterate once and reuse the result for both the comparison and
        # the failure report.
        converted = transliterate(case[1]).lower()
        expected = case[0].lower()
        if converted != expected:
            failed.append((case[1].lower(), converted, expected))
    if failed:
        raise Exception(
            f"Failed {len(failed)}/{len(cases)} ({round((len(failed)/len(cases))*100,2)}%) cases.\n"
            + tabulate(failed, headers=["Original", "Converted", "Ground truth"])
        )
def test_letter_coverage():
    """
    Verify that every alphabet letter occurs somewhere in the test dataset.

    Column 0 of the rows returned by _read_test_cases() is searched for the Latin
    letters and column 1 for the Cyrillic ones; both texts are lower-cased first.
    An Exception naming all absent letters is raised when any letter is missing.
    """
    latin_alphabet = "a â b c ç d e f g ğ h ı i j k l m n ñ o ö p q r s ş t u ü v y z".split()
    # A standalone "ъ" is not part of the list; it only appears inside the
    # two-character entries.
    cyrillic_alphabet = (
        "а б в г гъ д е ё ж з и й к къ л м н нъ о п р с т у ф х ц ч дж ш щ ы ь э ю я"
    ).split()

    rows = _read_test_cases()
    absent = []
    for column, alphabet in ((0, latin_alphabet), (1, cyrillic_alphabet)):
        remaining = " ".join(row[column] for row in rows).lower()
        # Longest entries first: each entry is removed from the text after it is
        # looked up, so the single letters of a two-character entry only count
        # when they also occur on their own.
        for letter in sorted(alphabet, key=len, reverse=True):
            if letter not in remaining:
                absent.append(letter)
            remaining = remaining.replace(letter, "")

    if absent:
        raise Exception(f"'{absent}' not found in test dataset!")
def _read_test_cases():
    """
    Load the test cases from tests/rosetta.csv.

    The path is relative to the current working directory. The file is decoded
    as UTF-8 and every non-blank line is split on "|".

    Returns:
        list[list[str]]: one list of fields per non-blank line, in file order.
    """
    rows = []
    # Explicit encoding: the tests look for non-ASCII Latin and Cyrillic letters
    # in this data, which the platform default encoding may fail to decode.
    with open("tests/rosetta.csv", encoding="utf-8") as file:
        for line in file:
            line = line.rstrip("\n")
            # Skip blank lines (e.g. the one produced by a trailing newline);
            # they would otherwise become a one-field row and break callers
            # that index column 1.
            if not line.strip():
                continue
            rows.append(line.split("|"))
    return rows
|