added khmer, tibetan and lao
Browse files- data/Austro-Asiatic.json +77 -3
- data/Kra-Dai.json +33 -2
- data/Sino-Tibetan.json +86 -7
data/Austro-Asiatic.json
CHANGED
@@ -813,6 +813,13 @@
|
|
813 |
"children": [],
|
814 |
"family": "Austro-Asiatic",
|
815 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
816 |
"Latn": {
|
817 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
818 |
"original_lang_name": "vietnamese",
|
@@ -855,6 +862,13 @@
|
|
855 |
],
|
856 |
"family": "Austro-Asiatic",
|
857 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
858 |
"Latn": {
|
859 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
860 |
"original_lang_name": "vietnamese",
|
@@ -870,6 +884,13 @@
|
|
870 |
],
|
871 |
"family": "Austro-Asiatic",
|
872 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
873 |
"Latn": {
|
874 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
875 |
"original_lang_name": "vietnamese",
|
@@ -922,6 +943,13 @@
|
|
922 |
],
|
923 |
"family": "Austro-Asiatic",
|
924 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
925 |
"Latn": {
|
926 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
927 |
"original_lang_name": "vietnamese",
|
@@ -986,6 +1014,13 @@
|
|
986 |
],
|
987 |
"family": "Austro-Asiatic",
|
988 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
989 |
"Latn": {
|
990 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
991 |
"original_lang_name": "vietnamese",
|
@@ -1174,6 +1209,13 @@
|
|
1174 |
],
|
1175 |
"family": "Austro-Asiatic",
|
1176 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1177 |
"Latn": {
|
1178 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
1179 |
"original_lang_name": "vietnamese",
|
@@ -1568,9 +1610,19 @@
|
|
1568 |
"iso_3_code": "khm",
|
1569 |
"children": [],
|
1570 |
"family": "Austro-Asiatic",
|
1571 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1572 |
"node_i": "1283",
|
1573 |
-
"native_tokenizers": [
|
|
|
|
|
1574 |
"scripts": [
|
1575 |
"Khmr"
|
1576 |
]
|
@@ -1591,7 +1643,15 @@
|
|
1591 |
}
|
1592 |
],
|
1593 |
"family": "Austro-Asiatic",
|
1594 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1595 |
"node_i": "1282",
|
1596 |
"native_tokenizers": [],
|
1597 |
"scripts": []
|
@@ -1749,6 +1809,13 @@
|
|
1749 |
],
|
1750 |
"family": "Austro-Asiatic",
|
1751 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1752 |
"Latn": {
|
1753 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
1754 |
"original_lang_name": "vietnamese",
|
@@ -3141,6 +3208,13 @@
|
|
3141 |
],
|
3142 |
"family": "Austro-Asiatic",
|
3143 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3144 |
"Latn": {
|
3145 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
3146 |
"original_lang_name": "vietnamese",
|
|
|
813 |
"children": [],
|
814 |
"family": "Austro-Asiatic",
|
815 |
"tokenizers": {
|
816 |
+
"Khmr": {
|
817 |
+
"full_object": "KhmerTokenizer()",
|
818 |
+
"original_lang_name": "khmer",
|
819 |
+
"original_lang_code": "khm",
|
820 |
+
"script": "Khmr",
|
821 |
+
"class_name": "KhmerTokenizer"
|
822 |
+
},
|
823 |
"Latn": {
|
824 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
825 |
"original_lang_name": "vietnamese",
|
|
|
862 |
],
|
863 |
"family": "Austro-Asiatic",
|
864 |
"tokenizers": {
|
865 |
+
"Khmr": {
|
866 |
+
"full_object": "KhmerTokenizer()",
|
867 |
+
"original_lang_name": "khmer",
|
868 |
+
"original_lang_code": "khm",
|
869 |
+
"script": "Khmr",
|
870 |
+
"class_name": "KhmerTokenizer"
|
871 |
+
},
|
872 |
"Latn": {
|
873 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
874 |
"original_lang_name": "vietnamese",
|
|
|
884 |
],
|
885 |
"family": "Austro-Asiatic",
|
886 |
"tokenizers": {
|
887 |
+
"Khmr": {
|
888 |
+
"full_object": "KhmerTokenizer()",
|
889 |
+
"original_lang_name": "khmer",
|
890 |
+
"original_lang_code": "khm",
|
891 |
+
"script": "Khmr",
|
892 |
+
"class_name": "KhmerTokenizer"
|
893 |
+
},
|
894 |
"Latn": {
|
895 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
896 |
"original_lang_name": "vietnamese",
|
|
|
943 |
],
|
944 |
"family": "Austro-Asiatic",
|
945 |
"tokenizers": {
|
946 |
+
"Khmr": {
|
947 |
+
"full_object": "KhmerTokenizer()",
|
948 |
+
"original_lang_name": "khmer",
|
949 |
+
"original_lang_code": "khm",
|
950 |
+
"script": "Khmr",
|
951 |
+
"class_name": "KhmerTokenizer"
|
952 |
+
},
|
953 |
"Latn": {
|
954 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
955 |
"original_lang_name": "vietnamese",
|
|
|
1014 |
],
|
1015 |
"family": "Austro-Asiatic",
|
1016 |
"tokenizers": {
|
1017 |
+
"Khmr": {
|
1018 |
+
"full_object": "KhmerTokenizer()",
|
1019 |
+
"original_lang_name": "khmer",
|
1020 |
+
"original_lang_code": "khm",
|
1021 |
+
"script": "Khmr",
|
1022 |
+
"class_name": "KhmerTokenizer"
|
1023 |
+
},
|
1024 |
"Latn": {
|
1025 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
1026 |
"original_lang_name": "vietnamese",
|
|
|
1209 |
],
|
1210 |
"family": "Austro-Asiatic",
|
1211 |
"tokenizers": {
|
1212 |
+
"Khmr": {
|
1213 |
+
"full_object": "KhmerTokenizer()",
|
1214 |
+
"original_lang_name": "khmer",
|
1215 |
+
"original_lang_code": "khm",
|
1216 |
+
"script": "Khmr",
|
1217 |
+
"class_name": "KhmerTokenizer"
|
1218 |
+
},
|
1219 |
"Latn": {
|
1220 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
1221 |
"original_lang_name": "vietnamese",
|
|
|
1610 |
"iso_3_code": "khm",
|
1611 |
"children": [],
|
1612 |
"family": "Austro-Asiatic",
|
1613 |
+
"tokenizers": {
|
1614 |
+
"Khmr": {
|
1615 |
+
"full_object": "KhmerTokenizer()",
|
1616 |
+
"original_lang_name": "khmer",
|
1617 |
+
"original_lang_code": "khm",
|
1618 |
+
"script": "Khmr",
|
1619 |
+
"class_name": "KhmerTokenizer"
|
1620 |
+
}
|
1621 |
+
},
|
1622 |
"node_i": "1283",
|
1623 |
+
"native_tokenizers": [
|
1624 |
+
"Khmr"
|
1625 |
+
],
|
1626 |
"scripts": [
|
1627 |
"Khmr"
|
1628 |
]
|
|
|
1643 |
}
|
1644 |
],
|
1645 |
"family": "Austro-Asiatic",
|
1646 |
+
"tokenizers": {
|
1647 |
+
"Khmr": {
|
1648 |
+
"full_object": "KhmerTokenizer()",
|
1649 |
+
"original_lang_name": "khmer",
|
1650 |
+
"original_lang_code": "khm",
|
1651 |
+
"script": "Khmr",
|
1652 |
+
"class_name": "KhmerTokenizer"
|
1653 |
+
}
|
1654 |
+
},
|
1655 |
"node_i": "1282",
|
1656 |
"native_tokenizers": [],
|
1657 |
"scripts": []
|
|
|
1809 |
],
|
1810 |
"family": "Austro-Asiatic",
|
1811 |
"tokenizers": {
|
1812 |
+
"Khmr": {
|
1813 |
+
"full_object": "KhmerTokenizer()",
|
1814 |
+
"original_lang_name": "khmer",
|
1815 |
+
"original_lang_code": "khm",
|
1816 |
+
"script": "Khmr",
|
1817 |
+
"class_name": "KhmerTokenizer"
|
1818 |
+
},
|
1819 |
"Latn": {
|
1820 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
1821 |
"original_lang_name": "vietnamese",
|
|
|
3208 |
],
|
3209 |
"family": "Austro-Asiatic",
|
3210 |
"tokenizers": {
|
3211 |
+
"Khmr": {
|
3212 |
+
"full_object": "KhmerTokenizer()",
|
3213 |
+
"original_lang_name": "khmer",
|
3214 |
+
"original_lang_code": "khm",
|
3215 |
+
"script": "Khmr",
|
3216 |
+
"class_name": "KhmerTokenizer"
|
3217 |
+
},
|
3218 |
"Latn": {
|
3219 |
"full_object": "SpaCyTokenizer(\"vi\")",
|
3220 |
"original_lang_name": "vietnamese",
|
data/Kra-Dai.json
CHANGED
@@ -722,9 +722,19 @@
|
|
722 |
"iso_3_code": "lao",
|
723 |
"children": [],
|
724 |
"family": "Kra-Dai",
|
725 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
726 |
"node_i": "4725",
|
727 |
-
"native_tokenizers": [
|
|
|
|
|
728 |
"scripts": [
|
729 |
"Laoo"
|
730 |
]
|
@@ -1020,6 +1030,13 @@
|
|
1020 |
],
|
1021 |
"family": "Kra-Dai",
|
1022 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1023 |
"Thai": {
|
1024 |
"full_object": "ThaiTokenizer()",
|
1025 |
"original_lang_name": "thai",
|
@@ -1035,6 +1052,13 @@
|
|
1035 |
],
|
1036 |
"family": "Kra-Dai",
|
1037 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1038 |
"Thai": {
|
1039 |
"full_object": "ThaiTokenizer()",
|
1040 |
"original_lang_name": "thai",
|
@@ -1050,6 +1074,13 @@
|
|
1050 |
],
|
1051 |
"family": "Kra-Dai",
|
1052 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1053 |
"Thai": {
|
1054 |
"full_object": "ThaiTokenizer()",
|
1055 |
"original_lang_name": "thai",
|
|
|
722 |
"iso_3_code": "lao",
|
723 |
"children": [],
|
724 |
"family": "Kra-Dai",
|
725 |
+
"tokenizers": {
|
726 |
+
"Laoo": {
|
727 |
+
"full_object": "LaoTokenizer()",
|
728 |
+
"original_lang_name": "lao",
|
729 |
+
"original_lang_code": "lao",
|
730 |
+
"script": "Laoo",
|
731 |
+
"class_name": "LaoTokenizer"
|
732 |
+
}
|
733 |
+
},
|
734 |
"node_i": "4725",
|
735 |
+
"native_tokenizers": [
|
736 |
+
"Laoo"
|
737 |
+
],
|
738 |
"scripts": [
|
739 |
"Laoo"
|
740 |
]
|
|
|
1030 |
],
|
1031 |
"family": "Kra-Dai",
|
1032 |
"tokenizers": {
|
1033 |
+
"Laoo": {
|
1034 |
+
"full_object": "LaoTokenizer()",
|
1035 |
+
"original_lang_name": "lao",
|
1036 |
+
"original_lang_code": "lao",
|
1037 |
+
"script": "Laoo",
|
1038 |
+
"class_name": "LaoTokenizer"
|
1039 |
+
},
|
1040 |
"Thai": {
|
1041 |
"full_object": "ThaiTokenizer()",
|
1042 |
"original_lang_name": "thai",
|
|
|
1052 |
],
|
1053 |
"family": "Kra-Dai",
|
1054 |
"tokenizers": {
|
1055 |
+
"Laoo": {
|
1056 |
+
"full_object": "LaoTokenizer()",
|
1057 |
+
"original_lang_name": "lao",
|
1058 |
+
"original_lang_code": "lao",
|
1059 |
+
"script": "Laoo",
|
1060 |
+
"class_name": "LaoTokenizer"
|
1061 |
+
},
|
1062 |
"Thai": {
|
1063 |
"full_object": "ThaiTokenizer()",
|
1064 |
"original_lang_name": "thai",
|
|
|
1074 |
],
|
1075 |
"family": "Kra-Dai",
|
1076 |
"tokenizers": {
|
1077 |
+
"Laoo": {
|
1078 |
+
"full_object": "LaoTokenizer()",
|
1079 |
+
"original_lang_name": "lao",
|
1080 |
+
"original_lang_code": "lao",
|
1081 |
+
"script": "Laoo",
|
1082 |
+
"class_name": "LaoTokenizer"
|
1083 |
+
},
|
1084 |
"Thai": {
|
1085 |
"full_object": "ThaiTokenizer()",
|
1086 |
"original_lang_name": "thai",
|
data/Sino-Tibetan.json
CHANGED
@@ -5266,9 +5266,19 @@
|
|
5266 |
"iso_3_code": "bod",
|
5267 |
"children": [],
|
5268 |
"family": "Sino-Tibetan",
|
5269 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5270 |
"node_i": "9329",
|
5271 |
-
"native_tokenizers": [
|
|
|
|
|
5272 |
"scripts": [
|
5273 |
"Tibt"
|
5274 |
]
|
@@ -5509,9 +5519,19 @@
|
|
5509 |
"iso_3_code": "dzo",
|
5510 |
"children": [],
|
5511 |
"family": "Sino-Tibetan",
|
5512 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5513 |
"node_i": "9348",
|
5514 |
-
"native_tokenizers": [
|
|
|
|
|
5515 |
"scripts": [
|
5516 |
"Tibt"
|
5517 |
]
|
@@ -5590,7 +5610,15 @@
|
|
5590 |
}
|
5591 |
],
|
5592 |
"family": "Sino-Tibetan",
|
5593 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5594 |
"node_i": "9345",
|
5595 |
"native_tokenizers": [],
|
5596 |
"scripts": []
|
@@ -5647,6 +5675,13 @@
|
|
5647 |
],
|
5648 |
"family": "Sino-Tibetan",
|
5649 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5650 |
"Deva": {
|
5651 |
"full_object": "IndicNLPTokenizer(\"hi\")",
|
5652 |
"original_lang_name": "bodo",
|
@@ -5758,7 +5793,15 @@
|
|
5758 |
"iso_3_code": "lbj",
|
5759 |
"children": [],
|
5760 |
"family": "Sino-Tibetan",
|
5761 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5762 |
"node_i": "9367",
|
5763 |
"native_tokenizers": [],
|
5764 |
"scripts": [
|
@@ -5791,7 +5834,15 @@
|
|
5791 |
}
|
5792 |
],
|
5793 |
"family": "Sino-Tibetan",
|
5794 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5795 |
"node_i": "9364",
|
5796 |
"native_tokenizers": [],
|
5797 |
"scripts": []
|
@@ -5799,6 +5850,13 @@
|
|
5799 |
],
|
5800 |
"family": "Sino-Tibetan",
|
5801 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5802 |
"Deva": {
|
5803 |
"full_object": "IndicNLPTokenizer(\"hi\")",
|
5804 |
"original_lang_name": "bodo",
|
@@ -6464,6 +6522,13 @@
|
|
6464 |
],
|
6465 |
"family": "Sino-Tibetan",
|
6466 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6467 |
"Deva": {
|
6468 |
"full_object": "IndicNLPTokenizer(\"hi\")",
|
6469 |
"original_lang_name": "bodo",
|
@@ -7310,6 +7375,13 @@
|
|
7310 |
],
|
7311 |
"family": "Sino-Tibetan",
|
7312 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7313 |
"Deva": {
|
7314 |
"full_object": "IndicNLPTokenizer(\"hi\")",
|
7315 |
"original_lang_name": "bodo",
|
@@ -7331,6 +7403,13 @@
|
|
7331 |
"original_lang_code": "brx",
|
7332 |
"script": "Deva",
|
7333 |
"class_name": "IndicNLPTokenizer"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7334 |
}
|
7335 |
},
|
7336 |
"node_i": "8937",
|
|
|
5266 |
"iso_3_code": "bod",
|
5267 |
"children": [],
|
5268 |
"family": "Sino-Tibetan",
|
5269 |
+
"tokenizers": {
|
5270 |
+
"Tibt": {
|
5271 |
+
"full_object": "TibetanTokenizer()",
|
5272 |
+
"original_lang_name": "tibetan",
|
5273 |
+
"original_lang_code": "bod",
|
5274 |
+
"script": "Tibt",
|
5275 |
+
"class_name": "TibetanTokenizer"
|
5276 |
+
}
|
5277 |
+
},
|
5278 |
"node_i": "9329",
|
5279 |
+
"native_tokenizers": [
|
5280 |
+
"Tibt"
|
5281 |
+
],
|
5282 |
"scripts": [
|
5283 |
"Tibt"
|
5284 |
]
|
|
|
5519 |
"iso_3_code": "dzo",
|
5520 |
"children": [],
|
5521 |
"family": "Sino-Tibetan",
|
5522 |
+
"tokenizers": {
|
5523 |
+
"Tibt": {
|
5524 |
+
"full_object": "TibetanTokenizer()",
|
5525 |
+
"original_lang_name": "dzongkha",
|
5526 |
+
"original_lang_code": "dzo",
|
5527 |
+
"script": "Tibt",
|
5528 |
+
"class_name": "TibetanTokenizer"
|
5529 |
+
}
|
5530 |
+
},
|
5531 |
"node_i": "9348",
|
5532 |
+
"native_tokenizers": [
|
5533 |
+
"Tibt"
|
5534 |
+
],
|
5535 |
"scripts": [
|
5536 |
"Tibt"
|
5537 |
]
|
|
|
5610 |
}
|
5611 |
],
|
5612 |
"family": "Sino-Tibetan",
|
5613 |
+
"tokenizers": {
|
5614 |
+
"Tibt": {
|
5615 |
+
"full_object": "TibetanTokenizer()",
|
5616 |
+
"original_lang_name": "dzongkha",
|
5617 |
+
"original_lang_code": "dzo",
|
5618 |
+
"script": "Tibt",
|
5619 |
+
"class_name": "TibetanTokenizer"
|
5620 |
+
}
|
5621 |
+
},
|
5622 |
"node_i": "9345",
|
5623 |
"native_tokenizers": [],
|
5624 |
"scripts": []
|
|
|
5675 |
],
|
5676 |
"family": "Sino-Tibetan",
|
5677 |
"tokenizers": {
|
5678 |
+
"Tibt": {
|
5679 |
+
"full_object": "TibetanTokenizer()",
|
5680 |
+
"original_lang_name": "tibetan",
|
5681 |
+
"original_lang_code": "bod",
|
5682 |
+
"script": "Tibt",
|
5683 |
+
"class_name": "TibetanTokenizer"
|
5684 |
+
},
|
5685 |
"Deva": {
|
5686 |
"full_object": "IndicNLPTokenizer(\"hi\")",
|
5687 |
"original_lang_name": "bodo",
|
|
|
5793 |
"iso_3_code": "lbj",
|
5794 |
"children": [],
|
5795 |
"family": "Sino-Tibetan",
|
5796 |
+
"tokenizers": {
|
5797 |
+
"Tibt": {
|
5798 |
+
"full_object": "TibetanTokenizer()",
|
5799 |
+
"original_lang_name": "tibetan",
|
5800 |
+
"original_lang_code": "bod",
|
5801 |
+
"script": "Tibt",
|
5802 |
+
"class_name": "TibetanTokenizer"
|
5803 |
+
}
|
5804 |
+
},
|
5805 |
"node_i": "9367",
|
5806 |
"native_tokenizers": [],
|
5807 |
"scripts": [
|
|
|
5834 |
}
|
5835 |
],
|
5836 |
"family": "Sino-Tibetan",
|
5837 |
+
"tokenizers": {
|
5838 |
+
"Tibt": {
|
5839 |
+
"full_object": "TibetanTokenizer()",
|
5840 |
+
"original_lang_name": "tibetan",
|
5841 |
+
"original_lang_code": "bod",
|
5842 |
+
"script": "Tibt",
|
5843 |
+
"class_name": "TibetanTokenizer"
|
5844 |
+
}
|
5845 |
+
},
|
5846 |
"node_i": "9364",
|
5847 |
"native_tokenizers": [],
|
5848 |
"scripts": []
|
|
|
5850 |
],
|
5851 |
"family": "Sino-Tibetan",
|
5852 |
"tokenizers": {
|
5853 |
+
"Tibt": {
|
5854 |
+
"full_object": "TibetanTokenizer()",
|
5855 |
+
"original_lang_name": "tibetan",
|
5856 |
+
"original_lang_code": "bod",
|
5857 |
+
"script": "Tibt",
|
5858 |
+
"class_name": "TibetanTokenizer"
|
5859 |
+
},
|
5860 |
"Deva": {
|
5861 |
"full_object": "IndicNLPTokenizer(\"hi\")",
|
5862 |
"original_lang_name": "bodo",
|
|
|
6522 |
],
|
6523 |
"family": "Sino-Tibetan",
|
6524 |
"tokenizers": {
|
6525 |
+
"Tibt": {
|
6526 |
+
"full_object": "TibetanTokenizer()",
|
6527 |
+
"original_lang_name": "tibetan",
|
6528 |
+
"original_lang_code": "bod",
|
6529 |
+
"script": "Tibt",
|
6530 |
+
"class_name": "TibetanTokenizer"
|
6531 |
+
},
|
6532 |
"Deva": {
|
6533 |
"full_object": "IndicNLPTokenizer(\"hi\")",
|
6534 |
"original_lang_name": "bodo",
|
|
|
7375 |
],
|
7376 |
"family": "Sino-Tibetan",
|
7377 |
"tokenizers": {
|
7378 |
+
"Tibt": {
|
7379 |
+
"full_object": "TibetanTokenizer()",
|
7380 |
+
"original_lang_name": "tibetan",
|
7381 |
+
"original_lang_code": "bod",
|
7382 |
+
"script": "Tibt",
|
7383 |
+
"class_name": "TibetanTokenizer"
|
7384 |
+
},
|
7385 |
"Deva": {
|
7386 |
"full_object": "IndicNLPTokenizer(\"hi\")",
|
7387 |
"original_lang_name": "bodo",
|
|
|
7403 |
"original_lang_code": "brx",
|
7404 |
"script": "Deva",
|
7405 |
"class_name": "IndicNLPTokenizer"
|
7406 |
+
},
|
7407 |
+
"Tibt": {
|
7408 |
+
"full_object": "TibetanTokenizer()",
|
7409 |
+
"original_lang_name": "tibetan",
|
7410 |
+
"original_lang_code": "bod",
|
7411 |
+
"script": "Tibt",
|
7412 |
+
"class_name": "TibetanTokenizer"
|
7413 |
}
|
7414 |
},
|
7415 |
"node_i": "8937",
|