guipenedo HF staff committed on
Commit
baa687b
·
unverified ·
1 Parent(s): 49dc1e7

added khmer, tibetan and lao

Browse files
data/Austro-Asiatic.json CHANGED
@@ -813,6 +813,13 @@
813
  "children": [],
814
  "family": "Austro-Asiatic",
815
  "tokenizers": {
 
 
 
 
 
 
 
816
  "Latn": {
817
  "full_object": "SpaCyTokenizer(\"vi\")",
818
  "original_lang_name": "vietnamese",
@@ -855,6 +862,13 @@
855
  ],
856
  "family": "Austro-Asiatic",
857
  "tokenizers": {
 
 
 
 
 
 
 
858
  "Latn": {
859
  "full_object": "SpaCyTokenizer(\"vi\")",
860
  "original_lang_name": "vietnamese",
@@ -870,6 +884,13 @@
870
  ],
871
  "family": "Austro-Asiatic",
872
  "tokenizers": {
 
 
 
 
 
 
 
873
  "Latn": {
874
  "full_object": "SpaCyTokenizer(\"vi\")",
875
  "original_lang_name": "vietnamese",
@@ -922,6 +943,13 @@
922
  ],
923
  "family": "Austro-Asiatic",
924
  "tokenizers": {
 
 
 
 
 
 
 
925
  "Latn": {
926
  "full_object": "SpaCyTokenizer(\"vi\")",
927
  "original_lang_name": "vietnamese",
@@ -986,6 +1014,13 @@
986
  ],
987
  "family": "Austro-Asiatic",
988
  "tokenizers": {
 
 
 
 
 
 
 
989
  "Latn": {
990
  "full_object": "SpaCyTokenizer(\"vi\")",
991
  "original_lang_name": "vietnamese",
@@ -1174,6 +1209,13 @@
1174
  ],
1175
  "family": "Austro-Asiatic",
1176
  "tokenizers": {
 
 
 
 
 
 
 
1177
  "Latn": {
1178
  "full_object": "SpaCyTokenizer(\"vi\")",
1179
  "original_lang_name": "vietnamese",
@@ -1568,9 +1610,19 @@
1568
  "iso_3_code": "khm",
1569
  "children": [],
1570
  "family": "Austro-Asiatic",
1571
- "tokenizers": {},
 
 
 
 
 
 
 
 
1572
  "node_i": "1283",
1573
- "native_tokenizers": [],
 
 
1574
  "scripts": [
1575
  "Khmr"
1576
  ]
@@ -1591,7 +1643,15 @@
1591
  }
1592
  ],
1593
  "family": "Austro-Asiatic",
1594
- "tokenizers": {},
 
 
 
 
 
 
 
 
1595
  "node_i": "1282",
1596
  "native_tokenizers": [],
1597
  "scripts": []
@@ -1749,6 +1809,13 @@
1749
  ],
1750
  "family": "Austro-Asiatic",
1751
  "tokenizers": {
 
 
 
 
 
 
 
1752
  "Latn": {
1753
  "full_object": "SpaCyTokenizer(\"vi\")",
1754
  "original_lang_name": "vietnamese",
@@ -3141,6 +3208,13 @@
3141
  ],
3142
  "family": "Austro-Asiatic",
3143
  "tokenizers": {
 
 
 
 
 
 
 
3144
  "Latn": {
3145
  "full_object": "SpaCyTokenizer(\"vi\")",
3146
  "original_lang_name": "vietnamese",
 
813
  "children": [],
814
  "family": "Austro-Asiatic",
815
  "tokenizers": {
816
+ "Khmr": {
817
+ "full_object": "KhmerTokenizer()",
818
+ "original_lang_name": "khmer",
819
+ "original_lang_code": "khm",
820
+ "script": "Khmr",
821
+ "class_name": "KhmerTokenizer"
822
+ },
823
  "Latn": {
824
  "full_object": "SpaCyTokenizer(\"vi\")",
825
  "original_lang_name": "vietnamese",
 
862
  ],
863
  "family": "Austro-Asiatic",
864
  "tokenizers": {
865
+ "Khmr": {
866
+ "full_object": "KhmerTokenizer()",
867
+ "original_lang_name": "khmer",
868
+ "original_lang_code": "khm",
869
+ "script": "Khmr",
870
+ "class_name": "KhmerTokenizer"
871
+ },
872
  "Latn": {
873
  "full_object": "SpaCyTokenizer(\"vi\")",
874
  "original_lang_name": "vietnamese",
 
884
  ],
885
  "family": "Austro-Asiatic",
886
  "tokenizers": {
887
+ "Khmr": {
888
+ "full_object": "KhmerTokenizer()",
889
+ "original_lang_name": "khmer",
890
+ "original_lang_code": "khm",
891
+ "script": "Khmr",
892
+ "class_name": "KhmerTokenizer"
893
+ },
894
  "Latn": {
895
  "full_object": "SpaCyTokenizer(\"vi\")",
896
  "original_lang_name": "vietnamese",
 
943
  ],
944
  "family": "Austro-Asiatic",
945
  "tokenizers": {
946
+ "Khmr": {
947
+ "full_object": "KhmerTokenizer()",
948
+ "original_lang_name": "khmer",
949
+ "original_lang_code": "khm",
950
+ "script": "Khmr",
951
+ "class_name": "KhmerTokenizer"
952
+ },
953
  "Latn": {
954
  "full_object": "SpaCyTokenizer(\"vi\")",
955
  "original_lang_name": "vietnamese",
 
1014
  ],
1015
  "family": "Austro-Asiatic",
1016
  "tokenizers": {
1017
+ "Khmr": {
1018
+ "full_object": "KhmerTokenizer()",
1019
+ "original_lang_name": "khmer",
1020
+ "original_lang_code": "khm",
1021
+ "script": "Khmr",
1022
+ "class_name": "KhmerTokenizer"
1023
+ },
1024
  "Latn": {
1025
  "full_object": "SpaCyTokenizer(\"vi\")",
1026
  "original_lang_name": "vietnamese",
 
1209
  ],
1210
  "family": "Austro-Asiatic",
1211
  "tokenizers": {
1212
+ "Khmr": {
1213
+ "full_object": "KhmerTokenizer()",
1214
+ "original_lang_name": "khmer",
1215
+ "original_lang_code": "khm",
1216
+ "script": "Khmr",
1217
+ "class_name": "KhmerTokenizer"
1218
+ },
1219
  "Latn": {
1220
  "full_object": "SpaCyTokenizer(\"vi\")",
1221
  "original_lang_name": "vietnamese",
 
1610
  "iso_3_code": "khm",
1611
  "children": [],
1612
  "family": "Austro-Asiatic",
1613
+ "tokenizers": {
1614
+ "Khmr": {
1615
+ "full_object": "KhmerTokenizer()",
1616
+ "original_lang_name": "khmer",
1617
+ "original_lang_code": "khm",
1618
+ "script": "Khmr",
1619
+ "class_name": "KhmerTokenizer"
1620
+ }
1621
+ },
1622
  "node_i": "1283",
1623
+ "native_tokenizers": [
1624
+ "Khmr"
1625
+ ],
1626
  "scripts": [
1627
  "Khmr"
1628
  ]
 
1643
  }
1644
  ],
1645
  "family": "Austro-Asiatic",
1646
+ "tokenizers": {
1647
+ "Khmr": {
1648
+ "full_object": "KhmerTokenizer()",
1649
+ "original_lang_name": "khmer",
1650
+ "original_lang_code": "khm",
1651
+ "script": "Khmr",
1652
+ "class_name": "KhmerTokenizer"
1653
+ }
1654
+ },
1655
  "node_i": "1282",
1656
  "native_tokenizers": [],
1657
  "scripts": []
 
1809
  ],
1810
  "family": "Austro-Asiatic",
1811
  "tokenizers": {
1812
+ "Khmr": {
1813
+ "full_object": "KhmerTokenizer()",
1814
+ "original_lang_name": "khmer",
1815
+ "original_lang_code": "khm",
1816
+ "script": "Khmr",
1817
+ "class_name": "KhmerTokenizer"
1818
+ },
1819
  "Latn": {
1820
  "full_object": "SpaCyTokenizer(\"vi\")",
1821
  "original_lang_name": "vietnamese",
 
3208
  ],
3209
  "family": "Austro-Asiatic",
3210
  "tokenizers": {
3211
+ "Khmr": {
3212
+ "full_object": "KhmerTokenizer()",
3213
+ "original_lang_name": "khmer",
3214
+ "original_lang_code": "khm",
3215
+ "script": "Khmr",
3216
+ "class_name": "KhmerTokenizer"
3217
+ },
3218
  "Latn": {
3219
  "full_object": "SpaCyTokenizer(\"vi\")",
3220
  "original_lang_name": "vietnamese",
data/Kra-Dai.json CHANGED
@@ -722,9 +722,19 @@
722
  "iso_3_code": "lao",
723
  "children": [],
724
  "family": "Kra-Dai",
725
- "tokenizers": {},
 
 
 
 
 
 
 
 
726
  "node_i": "4725",
727
- "native_tokenizers": [],
 
 
728
  "scripts": [
729
  "Laoo"
730
  ]
@@ -1020,6 +1030,13 @@
1020
  ],
1021
  "family": "Kra-Dai",
1022
  "tokenizers": {
 
 
 
 
 
 
 
1023
  "Thai": {
1024
  "full_object": "ThaiTokenizer()",
1025
  "original_lang_name": "thai",
@@ -1035,6 +1052,13 @@
1035
  ],
1036
  "family": "Kra-Dai",
1037
  "tokenizers": {
 
 
 
 
 
 
 
1038
  "Thai": {
1039
  "full_object": "ThaiTokenizer()",
1040
  "original_lang_name": "thai",
@@ -1050,6 +1074,13 @@
1050
  ],
1051
  "family": "Kra-Dai",
1052
  "tokenizers": {
 
 
 
 
 
 
 
1053
  "Thai": {
1054
  "full_object": "ThaiTokenizer()",
1055
  "original_lang_name": "thai",
 
722
  "iso_3_code": "lao",
723
  "children": [],
724
  "family": "Kra-Dai",
725
+ "tokenizers": {
726
+ "Laoo": {
727
+ "full_object": "LaoTokenizer()",
728
+ "original_lang_name": "lao",
729
+ "original_lang_code": "lao",
730
+ "script": "Laoo",
731
+ "class_name": "LaoTokenizer"
732
+ }
733
+ },
734
  "node_i": "4725",
735
+ "native_tokenizers": [
736
+ "Laoo"
737
+ ],
738
  "scripts": [
739
  "Laoo"
740
  ]
 
1030
  ],
1031
  "family": "Kra-Dai",
1032
  "tokenizers": {
1033
+ "Laoo": {
1034
+ "full_object": "LaoTokenizer()",
1035
+ "original_lang_name": "lao",
1036
+ "original_lang_code": "lao",
1037
+ "script": "Laoo",
1038
+ "class_name": "LaoTokenizer"
1039
+ },
1040
  "Thai": {
1041
  "full_object": "ThaiTokenizer()",
1042
  "original_lang_name": "thai",
 
1052
  ],
1053
  "family": "Kra-Dai",
1054
  "tokenizers": {
1055
+ "Laoo": {
1056
+ "full_object": "LaoTokenizer()",
1057
+ "original_lang_name": "lao",
1058
+ "original_lang_code": "lao",
1059
+ "script": "Laoo",
1060
+ "class_name": "LaoTokenizer"
1061
+ },
1062
  "Thai": {
1063
  "full_object": "ThaiTokenizer()",
1064
  "original_lang_name": "thai",
 
1074
  ],
1075
  "family": "Kra-Dai",
1076
  "tokenizers": {
1077
+ "Laoo": {
1078
+ "full_object": "LaoTokenizer()",
1079
+ "original_lang_name": "lao",
1080
+ "original_lang_code": "lao",
1081
+ "script": "Laoo",
1082
+ "class_name": "LaoTokenizer"
1083
+ },
1084
  "Thai": {
1085
  "full_object": "ThaiTokenizer()",
1086
  "original_lang_name": "thai",
data/Sino-Tibetan.json CHANGED
@@ -5266,9 +5266,19 @@
5266
  "iso_3_code": "bod",
5267
  "children": [],
5268
  "family": "Sino-Tibetan",
5269
- "tokenizers": {},
 
 
 
 
 
 
 
 
5270
  "node_i": "9329",
5271
- "native_tokenizers": [],
 
 
5272
  "scripts": [
5273
  "Tibt"
5274
  ]
@@ -5509,9 +5519,19 @@
5509
  "iso_3_code": "dzo",
5510
  "children": [],
5511
  "family": "Sino-Tibetan",
5512
- "tokenizers": {},
 
 
 
 
 
 
 
 
5513
  "node_i": "9348",
5514
- "native_tokenizers": [],
 
 
5515
  "scripts": [
5516
  "Tibt"
5517
  ]
@@ -5590,7 +5610,15 @@
5590
  }
5591
  ],
5592
  "family": "Sino-Tibetan",
5593
- "tokenizers": {},
 
 
 
 
 
 
 
 
5594
  "node_i": "9345",
5595
  "native_tokenizers": [],
5596
  "scripts": []
@@ -5647,6 +5675,13 @@
5647
  ],
5648
  "family": "Sino-Tibetan",
5649
  "tokenizers": {
 
 
 
 
 
 
 
5650
  "Deva": {
5651
  "full_object": "IndicNLPTokenizer(\"hi\")",
5652
  "original_lang_name": "bodo",
@@ -5758,7 +5793,15 @@
5758
  "iso_3_code": "lbj",
5759
  "children": [],
5760
  "family": "Sino-Tibetan",
5761
- "tokenizers": {},
 
 
 
 
 
 
 
 
5762
  "node_i": "9367",
5763
  "native_tokenizers": [],
5764
  "scripts": [
@@ -5791,7 +5834,15 @@
5791
  }
5792
  ],
5793
  "family": "Sino-Tibetan",
5794
- "tokenizers": {},
 
 
 
 
 
 
 
 
5795
  "node_i": "9364",
5796
  "native_tokenizers": [],
5797
  "scripts": []
@@ -5799,6 +5850,13 @@
5799
  ],
5800
  "family": "Sino-Tibetan",
5801
  "tokenizers": {
 
 
 
 
 
 
 
5802
  "Deva": {
5803
  "full_object": "IndicNLPTokenizer(\"hi\")",
5804
  "original_lang_name": "bodo",
@@ -6464,6 +6522,13 @@
6464
  ],
6465
  "family": "Sino-Tibetan",
6466
  "tokenizers": {
 
 
 
 
 
 
 
6467
  "Deva": {
6468
  "full_object": "IndicNLPTokenizer(\"hi\")",
6469
  "original_lang_name": "bodo",
@@ -7310,6 +7375,13 @@
7310
  ],
7311
  "family": "Sino-Tibetan",
7312
  "tokenizers": {
 
 
 
 
 
 
 
7313
  "Deva": {
7314
  "full_object": "IndicNLPTokenizer(\"hi\")",
7315
  "original_lang_name": "bodo",
@@ -7331,6 +7403,13 @@
7331
  "original_lang_code": "brx",
7332
  "script": "Deva",
7333
  "class_name": "IndicNLPTokenizer"
 
 
 
 
 
 
 
7334
  }
7335
  },
7336
  "node_i": "8937",
 
5266
  "iso_3_code": "bod",
5267
  "children": [],
5268
  "family": "Sino-Tibetan",
5269
+ "tokenizers": {
5270
+ "Tibt": {
5271
+ "full_object": "TibetanTokenizer()",
5272
+ "original_lang_name": "tibetan",
5273
+ "original_lang_code": "bod",
5274
+ "script": "Tibt",
5275
+ "class_name": "TibetanTokenizer"
5276
+ }
5277
+ },
5278
  "node_i": "9329",
5279
+ "native_tokenizers": [
5280
+ "Tibt"
5281
+ ],
5282
  "scripts": [
5283
  "Tibt"
5284
  ]
 
5519
  "iso_3_code": "dzo",
5520
  "children": [],
5521
  "family": "Sino-Tibetan",
5522
+ "tokenizers": {
5523
+ "Tibt": {
5524
+ "full_object": "TibetanTokenizer()",
5525
+ "original_lang_name": "dzongkha",
5526
+ "original_lang_code": "dzo",
5527
+ "script": "Tibt",
5528
+ "class_name": "TibetanTokenizer"
5529
+ }
5530
+ },
5531
  "node_i": "9348",
5532
+ "native_tokenizers": [
5533
+ "Tibt"
5534
+ ],
5535
  "scripts": [
5536
  "Tibt"
5537
  ]
 
5610
  }
5611
  ],
5612
  "family": "Sino-Tibetan",
5613
+ "tokenizers": {
5614
+ "Tibt": {
5615
+ "full_object": "TibetanTokenizer()",
5616
+ "original_lang_name": "dzongkha",
5617
+ "original_lang_code": "dzo",
5618
+ "script": "Tibt",
5619
+ "class_name": "TibetanTokenizer"
5620
+ }
5621
+ },
5622
  "node_i": "9345",
5623
  "native_tokenizers": [],
5624
  "scripts": []
 
5675
  ],
5676
  "family": "Sino-Tibetan",
5677
  "tokenizers": {
5678
+ "Tibt": {
5679
+ "full_object": "TibetanTokenizer()",
5680
+ "original_lang_name": "tibetan",
5681
+ "original_lang_code": "bod",
5682
+ "script": "Tibt",
5683
+ "class_name": "TibetanTokenizer"
5684
+ },
5685
  "Deva": {
5686
  "full_object": "IndicNLPTokenizer(\"hi\")",
5687
  "original_lang_name": "bodo",
 
5793
  "iso_3_code": "lbj",
5794
  "children": [],
5795
  "family": "Sino-Tibetan",
5796
+ "tokenizers": {
5797
+ "Tibt": {
5798
+ "full_object": "TibetanTokenizer()",
5799
+ "original_lang_name": "tibetan",
5800
+ "original_lang_code": "bod",
5801
+ "script": "Tibt",
5802
+ "class_name": "TibetanTokenizer"
5803
+ }
5804
+ },
5805
  "node_i": "9367",
5806
  "native_tokenizers": [],
5807
  "scripts": [
 
5834
  }
5835
  ],
5836
  "family": "Sino-Tibetan",
5837
+ "tokenizers": {
5838
+ "Tibt": {
5839
+ "full_object": "TibetanTokenizer()",
5840
+ "original_lang_name": "tibetan",
5841
+ "original_lang_code": "bod",
5842
+ "script": "Tibt",
5843
+ "class_name": "TibetanTokenizer"
5844
+ }
5845
+ },
5846
  "node_i": "9364",
5847
  "native_tokenizers": [],
5848
  "scripts": []
 
5850
  ],
5851
  "family": "Sino-Tibetan",
5852
  "tokenizers": {
5853
+ "Tibt": {
5854
+ "full_object": "TibetanTokenizer()",
5855
+ "original_lang_name": "tibetan",
5856
+ "original_lang_code": "bod",
5857
+ "script": "Tibt",
5858
+ "class_name": "TibetanTokenizer"
5859
+ },
5860
  "Deva": {
5861
  "full_object": "IndicNLPTokenizer(\"hi\")",
5862
  "original_lang_name": "bodo",
 
6522
  ],
6523
  "family": "Sino-Tibetan",
6524
  "tokenizers": {
6525
+ "Tibt": {
6526
+ "full_object": "TibetanTokenizer()",
6527
+ "original_lang_name": "tibetan",
6528
+ "original_lang_code": "bod",
6529
+ "script": "Tibt",
6530
+ "class_name": "TibetanTokenizer"
6531
+ },
6532
  "Deva": {
6533
  "full_object": "IndicNLPTokenizer(\"hi\")",
6534
  "original_lang_name": "bodo",
 
7375
  ],
7376
  "family": "Sino-Tibetan",
7377
  "tokenizers": {
7378
+ "Tibt": {
7379
+ "full_object": "TibetanTokenizer()",
7380
+ "original_lang_name": "tibetan",
7381
+ "original_lang_code": "bod",
7382
+ "script": "Tibt",
7383
+ "class_name": "TibetanTokenizer"
7384
+ },
7385
  "Deva": {
7386
  "full_object": "IndicNLPTokenizer(\"hi\")",
7387
  "original_lang_name": "bodo",
 
7403
  "original_lang_code": "brx",
7404
  "script": "Deva",
7405
  "class_name": "IndicNLPTokenizer"
7406
+ },
7407
+ "Tibt": {
7408
+ "full_object": "TibetanTokenizer()",
7409
+ "original_lang_name": "tibetan",
7410
+ "original_lang_code": "bod",
7411
+ "script": "Tibt",
7412
+ "class_name": "TibetanTokenizer"
7413
  }
7414
  },
7415
  "node_i": "8937",