from crh_transliterator.transliterator import transliterate from tabulate import tabulate def test_transliterator(): cases = _read_test_cases() failed = [] for case in cases: if transliterate(case[1]).lower() != case[0].lower(): failed.append( (case[1].lower(), transliterate(case[1]).lower(), case[0].lower()) ) if len(failed) > 0: failed_rows = "\n".join([str(item) for item in failed]) raise Exception( f"Failed {len(failed)}/{len(cases)} ({round((len(failed)/len(cases))*100,2)}%) cases.\n" + tabulate(failed, headers=["Original", "Converted", "Ground truth"]) ) def test_letter_coverage(): """ Check if all letters are present in a test set. """ latin_alphabet = [ "a", "â", "b", "c", "ç", "d", "e", "f", "g", "ğ", "h", "ı", "i", "j", "k", "l", "m", "n", "ñ", "o", "ö", "p", "q", "r", "s", "ş", "t", "u", "ü", "v", "y", "z", ] cyrillic_alphabet = [ "а", "б", "в", "г", "гъ", "д", "е", "ё", "ж", "з", "и", "й", "к", "къ", "л", "м", "н", "нъ", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "дж", "ш", "щ", "ъ", "ы", "ь", "э", "ю", "я", ] cases = _read_test_cases() missing_letters = [] latin_cases = " ".join([case[0] for case in cases]).lower() for letter in sorted(latin_alphabet, key=lambda x: len(x), reverse=True): if letter not in latin_cases: missing_letters.append(letter) latin_cases = latin_cases.replace(letter, "") cyrillic_cases = " ".join([case[1] for case in cases]).lower() for letter in sorted(cyrillic_alphabet, key=lambda x: len(x), reverse=True): if letter not in cyrillic_cases: missing_letters.append(letter) cyrillic_cases = cyrillic_cases.replace(letter, "") if len(missing_letters) > 0: raise Exception(f"'{missing_letters}' not found in test dataset!") def _read_test_cases(): with open("tests/rosetta.csv") as file: text = file.read() rows = text.split("\n") for i in range(0, len(rows)): rows[i] = rows[i].split("|") return rows