File size: 3,355 Bytes
8c4bd28
ed1082e
55d834b
8c4bd28
 
ed1082e
8c4bd28
ed1082e
8c4bd28
ed1082e
 
 
 
 
 
 
 
 
 
8c4bd28
 
ed1082e
8c4bd28
55d834b
8c4bd28
ed1082e
55d834b
ed1082e
55d834b
 
 
 
 
 
 
8c4bd28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d24ba9
8c4bd28
 
8d24ba9
8c4bd28
8d24ba9
8c4bd28
 
8d24ba9
8c4bd28
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import converter
from crh_transliterator.transliterator import transliterate
from tabulate import tabulate


def test_latin_converter():
    """
    Check converter.to_latin against every case of the test set.

    For each case, element [1] is passed to converter.to_latin and the
    result is compared with element [0]. Both sides are lower-cased first,
    so the comparison is case-insensitive.

    Raises:
        Exception: if at least one case mismatches. The message holds the
            failure count and percentage, followed by a table with the
            original, converted and expected strings of every failure.
    """
    cases = _read_test_cases()
    failed = []
    for case in cases:
        # Convert once and reuse the result for both the check and the report.
        converted = converter.to_latin(case[1]).lower()
        expected = case[0].lower()
        if converted != expected:
            failed.append((case[1].lower(), converted, expected))
    if failed:
        raise Exception(
            f"Failed {len(failed)}/{len(cases)} ({round((len(failed)/len(cases))*100,2)}%) cases.\n"
            + tabulate(failed, headers=["Original", "Converted", "Ground truth"])
        )


def test_transliterator():
    """
    Check transliterate against every case of the test set.

    For each case, element [1] is passed to transliterate and the result is
    compared with element [0]. Both sides are lower-cased first, so the
    comparison is case-insensitive.

    Raises:
        Exception: if at least one case mismatches. The message holds the
            failure count and percentage, followed by a table with the
            original, converted and expected strings of every failure.
    """
    cases = _read_test_cases()
    failed = []
    for case in cases:
        # Transliterate once and reuse the result for both the check and the report.
        converted = transliterate(case[1]).lower()
        expected = case[0].lower()
        if converted != expected:
            failed.append((case[1].lower(), converted, expected))
    if failed:
        raise Exception(
            f"Failed {len(failed)}/{len(cases)} ({round((len(failed)/len(cases))*100,2)}%) cases.\n"
            + tabulate(failed, headers=["Original", "Converted", "Ground truth"])
        )


def test_letter_coverage():
    """
    Check if all letters are present in a test set.

    Element [0] of every case is searched for the Latin alphabet and
    element [1] for the Cyrillic alphabet, both after lower-casing.
    Multi-character letters are checked before single characters and every
    checked letter is blanked out of the text afterwards, so a character
    that only occurs inside a longer letter (e.g. "г" inside "гъ") does
    not count as coverage of the shorter letter.

    Raises:
        Exception: if at least one letter of either alphabet does not
            occur in the test set; the message lists the missing letters.
    """
    latin_alphabet = [
        "a", "â", "b", "c", "ç", "d", "e", "f",
        "g", "ğ", "h", "ı", "i", "j", "k", "l",
        "m", "n", "ñ", "o", "ö", "p", "q", "r",
        "s", "ş", "t", "u", "ü", "v", "y", "z",
    ]
    cyrillic_alphabet = [
        "а", "б", "в", "г", "гъ", "д", "е", "ё",
        "ж", "з", "и", "й", "к", "къ", "л", "м",
        "н", "нъ", "о", "п", "р", "с", "т", "у",
        "ф", "х", "ц", "ч", "дж", "ш", "щ", "ъ",
        "ы", "ь", "э", "ю", "я",
    ]
    cases = _read_test_cases()
    latin_text = " ".join(case[0] for case in cases).lower()
    cyrillic_text = " ".join(case[1] for case in cases).lower()

    missing_letters = []
    for alphabet, text in (
        (latin_alphabet, latin_text),
        (cyrillic_alphabet, cyrillic_text),
    ):
        # Longest letters first: digraphs must be consumed before their parts.
        for letter in sorted(alphabet, key=len, reverse=True):
            if letter not in text:
                missing_letters.append(letter)
            # Blank the letter out with a space instead of deleting it:
            # deleting would fuse its neighbours and could fabricate a
            # digraph that never occurred in the data ("дгъж" -> "дж").
            text = text.replace(letter, " ")
    if missing_letters:
        raise Exception(f"'{missing_letters}' not found in test dataset!")


def _read_test_cases():
    """
    Load the test cases from tests/rosetta.csv.

    The path is relative to the current working directory. Every non-blank
    line of the file is split on "|" into its columns.

    Returns:
        A list with one list of column strings per non-blank line.
    """
    rows = []
    # Explicit UTF-8: the data holds Cyrillic and extended-Latin text, which
    # the platform default encoding is not guaranteed to decode.
    with open("tests/rosetta.csv", encoding="utf-8") as file:
        for line in file:
            line = line.rstrip("\n")
            # Skip blank lines (such as the one a trailing newline produces);
            # they would become one-column rows and break case[1] lookups.
            if not line.strip():
                continue
            rows.append(line.split("|"))
    return rows