Spaces:
Running
Running
File size: 3,355 Bytes
8c4bd28 ed1082e 55d834b 8c4bd28 ed1082e 8c4bd28 ed1082e 8c4bd28 ed1082e 8c4bd28 ed1082e 8c4bd28 55d834b 8c4bd28 ed1082e 55d834b ed1082e 55d834b 8c4bd28 8d24ba9 8c4bd28 8d24ba9 8c4bd28 8d24ba9 8c4bd28 8d24ba9 8c4bd28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import converter
from crh_transliterator.transliterator import transliterate
from tabulate import tabulate
def test_latin_converter():
    """
    Check ``converter.to_latin`` against every case in the test dataset.

    Each case is a row from ``_read_test_cases()``: column 1 is the input
    text and column 0 is the expected Latin result. The comparison is
    case-insensitive (both sides are lower-cased).

    Raises:
        Exception: if at least one case does not match. The message holds
            the failure ratio and a table of
            (original, converted, ground truth) rows.
    """
    cases = _read_test_cases()
    failed = []
    for case in cases:
        # Convert once and reuse the result for both the check and the report.
        converted = converter.to_latin(case[1]).lower()
        expected = case[0].lower()
        if converted != expected:
            failed.append((case[1].lower(), converted, expected))
    if failed:
        raise Exception(
            f"Failed {len(failed)}/{len(cases)} ({round((len(failed)/len(cases))*100,2)}%) cases.\n"
            + tabulate(failed, headers=["Original", "Converted", "Ground truth"])
        )
def test_transliterator():
    """
    Check ``transliterate`` against every case in the test dataset.

    Each case is a row from ``_read_test_cases()``: column 1 is the input
    text and column 0 is the expected result. The comparison is
    case-insensitive (both sides are lower-cased).

    Raises:
        Exception: if at least one case does not match. The message holds
            the failure ratio and a table of
            (original, converted, ground truth) rows.
    """
    cases = _read_test_cases()
    failed = []
    for case in cases:
        # Transliterate once and reuse the result for the check and the report.
        converted = transliterate(case[1]).lower()
        expected = case[0].lower()
        if converted != expected:
            failed.append((case[1].lower(), converted, expected))
    if failed:
        raise Exception(
            f"Failed {len(failed)}/{len(cases)} ({round((len(failed)/len(cases))*100,2)}%) cases.\n"
            + tabulate(failed, headers=["Original", "Converted", "Ground truth"])
        )
def test_letter_coverage():
    """
    Verify that the test dataset exercises every letter of both alphabets.

    Column 0 of the dataset is scanned for the Latin letters and column 1
    for the Cyrillic ones, both lower-cased. Letters are checked longest
    first and every checked letter is stripped from the text afterwards, so
    a single letter only counts as covered when it also occurs outside the
    multi-character letters that contain it.

    Raises:
        Exception: listing every letter that was not found.
    """
    latin_alphabet = [
        "a", "â", "b", "c", "ç", "d", "e", "f",
        "g", "ğ", "h", "ı", "i", "j", "k", "l",
        "m", "n", "ñ", "o", "ö", "p", "q", "r",
        "s", "ş", "t", "u", "ü", "v", "y", "z",
    ]
    cyrillic_alphabet = [
        "а", "б", "в", "г", "гъ", "д", "е", "ё",
        "ж", "з", "и", "й", "к", "къ", "л", "м",
        "н", "нъ", "о", "п", "р", "с", "т", "у",
        "ф", "х", "ц", "ч", "дж", "ш", "щ", "ъ",
        "ы", "ь", "э", "ю", "я",
    ]
    cases = _read_test_cases()
    missing_letters = []
    # Latin text lives in column 0, Cyrillic text in column 1.
    for column, alphabet in ((0, latin_alphabet), (1, cyrillic_alphabet)):
        remaining = " ".join(case[column] for case in cases).lower()
        for letter in sorted(alphabet, key=len, reverse=True):
            if letter not in remaining:
                missing_letters.append(letter)
            # Drop the letter so shorter letters are not matched inside it.
            remaining = remaining.replace(letter, "")
    if missing_letters:
        raise Exception(f"'{missing_letters}' not found in test dataset!")
def _read_test_cases():
with open("tests/rosetta.csv") as file:
text = file.read()
rows = text.split("\n")
for i in range(0, len(rows)):
rows[i] = rows[i].split("|")
return rows
|