File size: 2,848 Bytes
8c4bd28
55d834b
8c4bd28
 
 
 
 
 
 
 
 
 
55d834b
8c4bd28
55d834b
 
 
 
 
 
 
 
 
 
8c4bd28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d24ba9
8c4bd28
 
8d24ba9
8c4bd28
8d24ba9
8c4bd28
 
8d24ba9
8c4bd28
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import converter
from tabulate import tabulate


def test_cyrillic_converter():
    cases = _read_test_cases()
    for case in cases:
        assert converter.to_cyrillic(case[0]) == case[1]


def test_latin_converter():
    cases = _read_test_cases()
    failed = []
    for case in cases:
        if converter.to_latin(case[1]).lower() != case[0].lower():
            failed.append(
                (case[1].lower(), converter.to_latin(case[1]).lower(), case[0].lower())
            )
    if len(failed) > 0:
        failed_rows = "\n".join([str(item) for item in failed])
        raise Exception(
            f"Failed {len(failed)}/{len(cases)} ({round((len(failed)/len(cases))*100,2)}%) cases.\n"
            + tabulate(failed, headers=["Original", "Converted", "Ground truth"])
        )


def test_letter_coverage():
    """
    Check if all letters are present in a test set.
    """
    latin_alphabet = [
        "a",
        "â",
        "b",
        "c",
        "ç",
        "d",
        "e",
        "f",
        "g",
        "ğ",
        "h",
        "ı",
        "i",
        "j",
        "k",
        "l",
        "m",
        "n",
        "ñ",
        "o",
        "ö",
        "p",
        "q",
        "r",
        "s",
        "ş",
        "t",
        "u",
        "ü",
        "v",
        "y",
        "z",
    ]
    cyrillic_alphabet = [
        "а",
        "б",
        "в",
        "г",
        "гъ",
        "д",
        "е",
        "ё",
        "ж",
        "з",
        "и",
        "й",
        "к",
        "къ",
        "л",
        "м",
        "н",
        "нъ",
        "о",
        "п",
        "р",
        "с",
        "т",
        "у",
        "ф",
        "х",
        "ц",
        "ч",
        "дж",
        "ш",
        "щ",
        "ъ",
        "ы",
        "ь",
        "э",
        "ю",
        "я",
    ]
    cases = _read_test_cases()
    missing_letters = []
    latin_cases = " ".join([case[0] for case in cases]).lower()
    for letter in sorted(latin_alphabet, key=lambda x: len(x), reverse=True):
        if letter not in latin_cases:
            missing_letters.append(letter)
        latin_cases = latin_cases.replace(letter, "")
    cyrillic_cases = " ".join([case[1] for case in cases]).lower()
    for letter in sorted(cyrillic_alphabet, key=lambda x: len(x), reverse=True):
        if letter not in cyrillic_cases:
            missing_letters.append(letter)
        cyrillic_cases = cyrillic_cases.replace(letter, "")
    if len(missing_letters) > 0:
        raise Exception(f"'{missing_letters}' not found in test dataset!")


def _read_test_cases():
    with open("tests/rosetta.csv") as file:
        text = file.read()

    rows = text.split("\n")
    for i in range(0, len(rows)):
        rows[i] = rows[i].split("|")
    return rows