Spaces:
Running
Running
theo
commited on
Commit
·
827b7ef
1
Parent(s):
1cc3978
validate against langcodes via textarea, better error display
Browse files- language_set.json +0 -478
- tagging_app.py +56 -48
language_set.json
DELETED
|
@@ -1,478 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"code": "Programming language code",
|
| 3 |
-
"aa": "Afar",
|
| 4 |
-
"ab": "Abkhazian",
|
| 5 |
-
"ace": "Achinese",
|
| 6 |
-
"ach": "Acoli",
|
| 7 |
-
"ada": "Adangme",
|
| 8 |
-
"ady": "Adyghe, Adygei",
|
| 9 |
-
"ae": "Avestan",
|
| 10 |
-
"af": "Afrikaans",
|
| 11 |
-
"afa": "Afro-Asiatic languages",
|
| 12 |
-
"afh": "Afrihili",
|
| 13 |
-
"ain": "Ainu (Japan)",
|
| 14 |
-
"ak": "Akan",
|
| 15 |
-
"akk": "Akkadian",
|
| 16 |
-
"ale": "Aleut",
|
| 17 |
-
"alg": "Algonquian languages",
|
| 18 |
-
"alt": "Southern Altai",
|
| 19 |
-
"am": "Amharic",
|
| 20 |
-
"an": "Aragonese",
|
| 21 |
-
"ang": "Old English (ca. 450-1100)",
|
| 22 |
-
"apa": "Apache languages",
|
| 23 |
-
"ar": "Arabic",
|
| 24 |
-
"arc": "Official Aramaic (700-300 BCE), Imperial Aramaic (700-300 BCE)",
|
| 25 |
-
"arn": "Mapudungun, Mapuche",
|
| 26 |
-
"arp": "Arapaho",
|
| 27 |
-
"art": "Artificial languages",
|
| 28 |
-
"arw": "Arawak",
|
| 29 |
-
"as": "Assamese",
|
| 30 |
-
"ast": "Asturian, Asturleonese, Bable, Leonese",
|
| 31 |
-
"ath": "Athapascan languages",
|
| 32 |
-
"aus": "Australian languages",
|
| 33 |
-
"av": "Avaric",
|
| 34 |
-
"awa": "Awadhi",
|
| 35 |
-
"ay": "Aymara",
|
| 36 |
-
"az": "Azerbaijani",
|
| 37 |
-
"ba": "Bashkir",
|
| 38 |
-
"bad": "Banda languages",
|
| 39 |
-
"bai": "Bamileke languages",
|
| 40 |
-
"bal": "Baluchi",
|
| 41 |
-
"ban": "Balinese",
|
| 42 |
-
"bas": "Basa (Cameroon)",
|
| 43 |
-
"bat": "Baltic languages",
|
| 44 |
-
"be": "Belarusian",
|
| 45 |
-
"bej": "Beja, Bedawiyet",
|
| 46 |
-
"bem": "Bemba (Zambia)",
|
| 47 |
-
"ber": "Berber languages",
|
| 48 |
-
"bg": "Bulgarian",
|
| 49 |
-
"bh": "Bihari languages",
|
| 50 |
-
"bho": "Bhojpuri",
|
| 51 |
-
"bi": "Bislama",
|
| 52 |
-
"bik": "Bikol",
|
| 53 |
-
"bin": "Bini, Edo",
|
| 54 |
-
"bla": "Siksika",
|
| 55 |
-
"bm": "Bambara",
|
| 56 |
-
"bn": "Bengali, Bangla",
|
| 57 |
-
"bnt": "Bantu languages",
|
| 58 |
-
"bo": "Tibetan",
|
| 59 |
-
"br": "Breton",
|
| 60 |
-
"bra": "Braj",
|
| 61 |
-
"bs": "Bosnian",
|
| 62 |
-
"btk": "Batak languages",
|
| 63 |
-
"bua": "Buriat",
|
| 64 |
-
"bug": "Buginese",
|
| 65 |
-
"byn": "Bilin, Blin",
|
| 66 |
-
"ca": "Catalan, Valencian",
|
| 67 |
-
"cad": "Caddo",
|
| 68 |
-
"cai": "Central American Indian languages",
|
| 69 |
-
"car": "Galibi Carib",
|
| 70 |
-
"cau": "Caucasian languages",
|
| 71 |
-
"ce": "Chechen",
|
| 72 |
-
"ceb": "Cebuano",
|
| 73 |
-
"cel": "Celtic languages",
|
| 74 |
-
"ch": "Chamorro",
|
| 75 |
-
"chb": "Chibcha",
|
| 76 |
-
"chg": "Chagatai",
|
| 77 |
-
"chk": "Chuukese",
|
| 78 |
-
"chm": "Mari (Russia)",
|
| 79 |
-
"chn": "Chinook jargon",
|
| 80 |
-
"cho": "Choctaw",
|
| 81 |
-
"chp": "Chipewyan, Dene Suline",
|
| 82 |
-
"chr": "Cherokee",
|
| 83 |
-
"chy": "Cheyenne",
|
| 84 |
-
"cmc": "Chamic languages",
|
| 85 |
-
"co": "Corsican",
|
| 86 |
-
"cop": "Coptic",
|
| 87 |
-
"cpe": "English-based creoles and pidgins",
|
| 88 |
-
"cpf": "French-based creoles and pidgins",
|
| 89 |
-
"cpp": "Portuguese-based creoles and pidgins",
|
| 90 |
-
"cr": "Cree",
|
| 91 |
-
"crh": "Crimean Tatar, Crimean Turkish",
|
| 92 |
-
"crp": "Creoles and pidgins",
|
| 93 |
-
"cs": "Czech",
|
| 94 |
-
"csb": "Kashubian",
|
| 95 |
-
"cu": "Church Slavic, Church Slavonic, Old Bulgarian, Old Church Slavonic, Old Slavonic",
|
| 96 |
-
"cus": "Cushitic languages",
|
| 97 |
-
"cv": "Chuvash",
|
| 98 |
-
"cy": "Welsh",
|
| 99 |
-
"da": "Danish",
|
| 100 |
-
"dak": "Dakota",
|
| 101 |
-
"dar": "Dargwa",
|
| 102 |
-
"day": "Land Dayak languages",
|
| 103 |
-
"de": "German",
|
| 104 |
-
"del": "Delaware",
|
| 105 |
-
"den": "Slave (Athapascan)",
|
| 106 |
-
"dgr": "Dogrib, T\u0142\u0131\u0328ch\u01eb",
|
| 107 |
-
"din": "Dinka",
|
| 108 |
-
"doi": "Dogri (macrolanguage)",
|
| 109 |
-
"dra": "Dravidian languages",
|
| 110 |
-
"dsb": "Lower Sorbian",
|
| 111 |
-
"dua": "Duala",
|
| 112 |
-
"dum": "Middle Dutch (ca. 1050-1350)",
|
| 113 |
-
"dv": "Dhivehi, Divehi, Maldivian",
|
| 114 |
-
"dyu": "Dyula",
|
| 115 |
-
"dz": "Dzongkha",
|
| 116 |
-
"ee": "Ewe",
|
| 117 |
-
"efi": "Efik",
|
| 118 |
-
"egy": "Egyptian (Ancient)",
|
| 119 |
-
"eka": "Ekajuk",
|
| 120 |
-
"el": "Modern Greek (1453-)",
|
| 121 |
-
"elx": "Elamite",
|
| 122 |
-
"en": "English",
|
| 123 |
-
"enm": "Middle English (1100-1500)",
|
| 124 |
-
"eo": "Esperanto",
|
| 125 |
-
"es": "Spanish, Castilian",
|
| 126 |
-
"et": "Estonian",
|
| 127 |
-
"eu": "Basque",
|
| 128 |
-
"ewo": "Ewondo",
|
| 129 |
-
"fa": "Persian",
|
| 130 |
-
"fan": "Fang (Equatorial Guinea)",
|
| 131 |
-
"fat": "Fanti",
|
| 132 |
-
"ff": "Fulah",
|
| 133 |
-
"fi": "Finnish",
|
| 134 |
-
"fil": "Filipino, Pilipino",
|
| 135 |
-
"fiu": "Finno-Ugrian languages",
|
| 136 |
-
"fj": "Fijian",
|
| 137 |
-
"fo": "Faroese",
|
| 138 |
-
"fon": "Fon",
|
| 139 |
-
"fr": "French",
|
| 140 |
-
"frm": "Middle French (ca. 1400-1600)",
|
| 141 |
-
"fro": "Old French (842-ca. 1400)",
|
| 142 |
-
"fur": "Friulian",
|
| 143 |
-
"fy": "Western Frisian",
|
| 144 |
-
"ga": "Irish",
|
| 145 |
-
"gaa": "Ga",
|
| 146 |
-
"gay": "Gayo",
|
| 147 |
-
"gba": "Gbaya (Central African Republic)",
|
| 148 |
-
"gd": "Scottish Gaelic, Gaelic",
|
| 149 |
-
"gem": "Germanic languages",
|
| 150 |
-
"gez": "Geez",
|
| 151 |
-
"gil": "Gilbertese",
|
| 152 |
-
"gl": "Galician",
|
| 153 |
-
"gmh": "Middle High German (ca. 1050-1500)",
|
| 154 |
-
"gn": "Guarani",
|
| 155 |
-
"goh": "Old High German (ca. 750-1050)",
|
| 156 |
-
"gon": "Gondi",
|
| 157 |
-
"gor": "Gorontalo",
|
| 158 |
-
"got": "Gothic",
|
| 159 |
-
"grb": "Grebo",
|
| 160 |
-
"grc": "Ancient Greek (to 1453)",
|
| 161 |
-
"gu": "Gujarati",
|
| 162 |
-
"gv": "Manx",
|
| 163 |
-
"gwi": "Gwich\u02bcin",
|
| 164 |
-
"ha": "Hausa",
|
| 165 |
-
"hai": "Haida",
|
| 166 |
-
"haw": "Hawaiian",
|
| 167 |
-
"he": "Hebrew",
|
| 168 |
-
"hi": "Hindi",
|
| 169 |
-
"hil": "Hiligaynon",
|
| 170 |
-
"him": "Himachali languages, Western Pahari languages",
|
| 171 |
-
"hit": "Hittite",
|
| 172 |
-
"hmn": "Hmong, Mong",
|
| 173 |
-
"ho": "Hiri Motu",
|
| 174 |
-
"hr": "Croatian",
|
| 175 |
-
"hsb": "Upper Sorbian",
|
| 176 |
-
"ht": "Haitian, Haitian Creole",
|
| 177 |
-
"hu": "Hungarian",
|
| 178 |
-
"hup": "Hupa",
|
| 179 |
-
"hy": "Armenian",
|
| 180 |
-
"hz": "Herero",
|
| 181 |
-
"ia": "Interlingua (International Auxiliary Language Association)",
|
| 182 |
-
"iba": "Iban",
|
| 183 |
-
"id": "Indonesian",
|
| 184 |
-
"ie": "Interlingue, Occidental",
|
| 185 |
-
"ig": "Igbo",
|
| 186 |
-
"ii": "Sichuan Yi, Nuosu",
|
| 187 |
-
"ijo": "Ijo languages",
|
| 188 |
-
"ik": "Inupiaq",
|
| 189 |
-
"ilo": "Iloko",
|
| 190 |
-
"inc": "Indic languages",
|
| 191 |
-
"ine": "Indo-European languages",
|
| 192 |
-
"inh": "Ingush",
|
| 193 |
-
"io": "Ido",
|
| 194 |
-
"ira": "Iranian languages",
|
| 195 |
-
"iro": "Iroquoian languages",
|
| 196 |
-
"is": "Icelandic",
|
| 197 |
-
"it": "Italian",
|
| 198 |
-
"iu": "Inuktitut",
|
| 199 |
-
"ja": "Japanese",
|
| 200 |
-
"jbo": "Lojban",
|
| 201 |
-
"jpr": "Judeo-Persian",
|
| 202 |
-
"jrb": "Judeo-Arabic",
|
| 203 |
-
"jv": "Javanese",
|
| 204 |
-
"ka": "Georgian",
|
| 205 |
-
"kaa": "Kara-Kalpak, Karakalpak",
|
| 206 |
-
"kab": "Kabyle",
|
| 207 |
-
"kac": "Kachin, Jingpho",
|
| 208 |
-
"kam": "Kamba (Kenya)",
|
| 209 |
-
"kar": "Karen languages",
|
| 210 |
-
"kaw": "Kawi",
|
| 211 |
-
"kbd": "Kabardian",
|
| 212 |
-
"kg": "Kongo",
|
| 213 |
-
"kha": "Khasi",
|
| 214 |
-
"khi": "Khoisan languages",
|
| 215 |
-
"kho": "Khotanese, Sakan",
|
| 216 |
-
"ki": "Kikuyu, Gikuyu",
|
| 217 |
-
"kj": "Kuanyama, Kwanyama",
|
| 218 |
-
"kk": "Kazakh",
|
| 219 |
-
"kl": "Kalaallisut, Greenlandic",
|
| 220 |
-
"km": "Khmer, Central Khmer",
|
| 221 |
-
"kmb": "Kimbundu",
|
| 222 |
-
"kn": "Kannada",
|
| 223 |
-
"ko": "Korean",
|
| 224 |
-
"kok": "Konkani (macrolanguage)",
|
| 225 |
-
"kos": "Kosraean",
|
| 226 |
-
"kpe": "Kpelle",
|
| 227 |
-
"kr": "Kanuri",
|
| 228 |
-
"krc": "Karachay-Balkar",
|
| 229 |
-
"kro": "Kru languages",
|
| 230 |
-
"kru": "Kurukh",
|
| 231 |
-
"ks": "Kashmiri",
|
| 232 |
-
"ku": "Kurdish",
|
| 233 |
-
"kum": "Kumyk",
|
| 234 |
-
"kut": "Kutenai",
|
| 235 |
-
"kv": "Komi",
|
| 236 |
-
"kw": "Cornish",
|
| 237 |
-
"ky": "Kirghiz, Kyrgyz",
|
| 238 |
-
"la": "Latin",
|
| 239 |
-
"lad": "Ladino",
|
| 240 |
-
"lah": "Lahnda",
|
| 241 |
-
"lam": "Lamba",
|
| 242 |
-
"lb": "Luxembourgish, Letzeburgesch",
|
| 243 |
-
"lez": "Lezghian",
|
| 244 |
-
"lg": "Ganda, Luganda",
|
| 245 |
-
"li": "Limburgan, Limburger, Limburgish",
|
| 246 |
-
"ln": "Lingala",
|
| 247 |
-
"lo": "Lao",
|
| 248 |
-
"lol": "Mongo",
|
| 249 |
-
"loz": "Lozi",
|
| 250 |
-
"lt": "Lithuanian",
|
| 251 |
-
"lu": "Luba-Katanga",
|
| 252 |
-
"lua": "Luba-Lulua",
|
| 253 |
-
"lui": "Luiseno",
|
| 254 |
-
"lun": "Lunda",
|
| 255 |
-
"luo": "Luo (Kenya and Tanzania), Dholuo",
|
| 256 |
-
"lus": "Lushai",
|
| 257 |
-
"lv": "Latvian",
|
| 258 |
-
"mad": "Madurese",
|
| 259 |
-
"mag": "Magahi",
|
| 260 |
-
"mai": "Maithili",
|
| 261 |
-
"mak": "Makasar",
|
| 262 |
-
"man": "Mandingo, Manding",
|
| 263 |
-
"map": "Austronesian languages",
|
| 264 |
-
"mas": "Masai",
|
| 265 |
-
"mdf": "Moksha",
|
| 266 |
-
"mdr": "Mandar",
|
| 267 |
-
"men": "Mende (Sierra Leone)",
|
| 268 |
-
"mg": "Malagasy",
|
| 269 |
-
"mga": "Middle Irish (900-1200)",
|
| 270 |
-
"mh": "Marshallese",
|
| 271 |
-
"mi": "Maori",
|
| 272 |
-
"mic": "Mi'kmaq, Micmac",
|
| 273 |
-
"min": "Minangkabau",
|
| 274 |
-
"mis": "Uncoded languages",
|
| 275 |
-
"mk": "Macedonian",
|
| 276 |
-
"mkh": "Mon-Khmer languages",
|
| 277 |
-
"ml": "Malayalam",
|
| 278 |
-
"mn": "Mongolian",
|
| 279 |
-
"mnc": "Manchu",
|
| 280 |
-
"mni": "Manipuri",
|
| 281 |
-
"mno": "Manobo languages",
|
| 282 |
-
"moh": "Mohawk",
|
| 283 |
-
"mos": "Mossi",
|
| 284 |
-
"mr": "Marathi",
|
| 285 |
-
"ms": "Malay (macrolanguage)",
|
| 286 |
-
"mt": "Maltese",
|
| 287 |
-
"mul": "Multiple languages",
|
| 288 |
-
"mun": "Munda languages",
|
| 289 |
-
"mus": "Creek",
|
| 290 |
-
"mwl": "Mirandese",
|
| 291 |
-
"mwr": "Marwari",
|
| 292 |
-
"my": "Burmese",
|
| 293 |
-
"myn": "Mayan languages",
|
| 294 |
-
"myv": "Erzya",
|
| 295 |
-
"na": "Nauru",
|
| 296 |
-
"nah": "Nahuatl languages",
|
| 297 |
-
"nai": "North American Indian languages",
|
| 298 |
-
"nap": "Neapolitan",
|
| 299 |
-
"nb": "Norwegian Bokm\u00e5l",
|
| 300 |
-
"nd": "North Ndebele",
|
| 301 |
-
"nds": "Low German, Low Saxon",
|
| 302 |
-
"ne": "Nepali (macrolanguage)",
|
| 303 |
-
"new": "Newari, Nepal Bhasa",
|
| 304 |
-
"ng": "Ndonga",
|
| 305 |
-
"nia": "Nias",
|
| 306 |
-
"nic": "Niger-Kordofanian languages",
|
| 307 |
-
"niu": "Niuean",
|
| 308 |
-
"nl": "Dutch, Flemish",
|
| 309 |
-
"nn": "Norwegian Nynorsk",
|
| 310 |
-
"no": "Norwegian",
|
| 311 |
-
"nog": "Nogai",
|
| 312 |
-
"non": "Old Norse",
|
| 313 |
-
"nr": "South Ndebele",
|
| 314 |
-
"nso": "Pedi, Northern Sotho, Sepedi",
|
| 315 |
-
"nub": "Nubian languages",
|
| 316 |
-
"nv": "Navajo, Navaho",
|
| 317 |
-
"nwc": "Classical Newari, Classical Nepal Bhasa, Old Newari",
|
| 318 |
-
"ny": "Nyanja, Chewa, Chichewa",
|
| 319 |
-
"nym": "Nyamwezi",
|
| 320 |
-
"nyn": "Nyankole",
|
| 321 |
-
"nyo": "Nyoro",
|
| 322 |
-
"nzi": "Nzima",
|
| 323 |
-
"oc": "Occitan (post 1500)",
|
| 324 |
-
"oj": "Ojibwa",
|
| 325 |
-
"om": "Oromo",
|
| 326 |
-
"or": "Oriya (macrolanguage), Odia (macrolanguage)",
|
| 327 |
-
"os": "Ossetian, Ossetic",
|
| 328 |
-
"osa": "Osage",
|
| 329 |
-
"ota": "Ottoman Turkish (1500-1928)",
|
| 330 |
-
"oto": "Otomian languages",
|
| 331 |
-
"pa": "Panjabi, Punjabi",
|
| 332 |
-
"paa": "Papuan languages",
|
| 333 |
-
"pag": "Pangasinan",
|
| 334 |
-
"pal": "Pahlavi",
|
| 335 |
-
"pam": "Pampanga, Kapampangan",
|
| 336 |
-
"pap": "Papiamento",
|
| 337 |
-
"pau": "Palauan",
|
| 338 |
-
"peo": "Old Persian (ca. 600-400 B.C.)",
|
| 339 |
-
"phi": "Philippine languages",
|
| 340 |
-
"phn": "Phoenician",
|
| 341 |
-
"pi": "Pali",
|
| 342 |
-
"pl": "Polish",
|
| 343 |
-
"pon": "Pohnpeian",
|
| 344 |
-
"pra": "Prakrit languages",
|
| 345 |
-
"pro": "Old Proven\u00e7al (to 1500), Old Occitan (to 1500)",
|
| 346 |
-
"ps": "Pushto, Pashto",
|
| 347 |
-
"pt": "Portuguese",
|
| 348 |
-
"qu": "Quechua",
|
| 349 |
-
"raj": "Rajasthani",
|
| 350 |
-
"rap": "Rapanui",
|
| 351 |
-
"rar": "Rarotongan, Cook Islands Maori",
|
| 352 |
-
"rm": "Romansh",
|
| 353 |
-
"rn": "Rundi",
|
| 354 |
-
"ro": "Romanian, Moldavian, Moldovan",
|
| 355 |
-
"roa": "Romance languages",
|
| 356 |
-
"rom": "Romany",
|
| 357 |
-
"ru": "Russian",
|
| 358 |
-
"rup": "Macedo-Romanian, Aromanian, Arumanian",
|
| 359 |
-
"rw": "Kinyarwanda",
|
| 360 |
-
"sa": "Sanskrit",
|
| 361 |
-
"sad": "Sandawe",
|
| 362 |
-
"sah": "Yakut",
|
| 363 |
-
"sai": "South American Indian languages",
|
| 364 |
-
"sal": "Salishan languages",
|
| 365 |
-
"sam": "Samaritan Aramaic",
|
| 366 |
-
"sas": "Sasak",
|
| 367 |
-
"sat": "Santali",
|
| 368 |
-
"sc": "Sardinian",
|
| 369 |
-
"scn": "Sicilian",
|
| 370 |
-
"sco": "Scots",
|
| 371 |
-
"sd": "Sindhi",
|
| 372 |
-
"se": "Northern Sami",
|
| 373 |
-
"sel": "Selkup",
|
| 374 |
-
"sem": "Semitic languages",
|
| 375 |
-
"sg": "Sango",
|
| 376 |
-
"sga": "Old Irish (to 900)",
|
| 377 |
-
"sgn": "Sign languages",
|
| 378 |
-
"sh": "Serbo-Croatian",
|
| 379 |
-
"shn": "Shan",
|
| 380 |
-
"si": "Sinhala, Sinhalese",
|
| 381 |
-
"sid": "Sidamo",
|
| 382 |
-
"sio": "Siouan languages",
|
| 383 |
-
"sit": "Sino-Tibetan languages",
|
| 384 |
-
"sk": "Slovak",
|
| 385 |
-
"sl": "Slovenian",
|
| 386 |
-
"sla": "Slavic languages",
|
| 387 |
-
"sm": "Samoan",
|
| 388 |
-
"sma": "Southern Sami",
|
| 389 |
-
"smi": "Sami languages",
|
| 390 |
-
"smj": "Lule Sami",
|
| 391 |
-
"smn": "Inari Sami",
|
| 392 |
-
"sms": "Skolt Sami",
|
| 393 |
-
"sn": "Shona",
|
| 394 |
-
"snk": "Soninke",
|
| 395 |
-
"so": "Somali",
|
| 396 |
-
"sog": "Sogdian",
|
| 397 |
-
"son": "Songhai languages",
|
| 398 |
-
"sq": "Albanian",
|
| 399 |
-
"sr": "Serbian",
|
| 400 |
-
"srn": "Sranan Tongo",
|
| 401 |
-
"srr": "Serer",
|
| 402 |
-
"ss": "Swati",
|
| 403 |
-
"ssa": "Nilo-Saharan languages",
|
| 404 |
-
"st": "Southern Sotho",
|
| 405 |
-
"su": "Sundanese",
|
| 406 |
-
"suk": "Sukuma",
|
| 407 |
-
"sus": "Susu",
|
| 408 |
-
"sux": "Sumerian",
|
| 409 |
-
"sv": "Swedish",
|
| 410 |
-
"sw": "Swahili (macrolanguage)",
|
| 411 |
-
"syr": "Syriac",
|
| 412 |
-
"ta": "Tamil",
|
| 413 |
-
"tai": "Tai languages",
|
| 414 |
-
"te": "Telugu",
|
| 415 |
-
"tem": "Timne",
|
| 416 |
-
"ter": "Tereno",
|
| 417 |
-
"tet": "Tetum",
|
| 418 |
-
"tg": "Tajik",
|
| 419 |
-
"th": "Thai",
|
| 420 |
-
"ti": "Tigrinya",
|
| 421 |
-
"tig": "Tigre",
|
| 422 |
-
"tiv": "Tiv",
|
| 423 |
-
"tk": "Turkmen",
|
| 424 |
-
"tkl": "Tokelau",
|
| 425 |
-
"tl": "Tagalog",
|
| 426 |
-
"tlh": "Klingon, tlhIngan Hol",
|
| 427 |
-
"tli": "Tlingit",
|
| 428 |
-
"tmh": "Tamashek",
|
| 429 |
-
"tn": "Tswana",
|
| 430 |
-
"to": "Tonga (Tonga Islands)",
|
| 431 |
-
"tog": "Tonga (Nyasa)",
|
| 432 |
-
"tpi": "Tok Pisin",
|
| 433 |
-
"tr": "Turkish",
|
| 434 |
-
"ts": "Tsonga",
|
| 435 |
-
"tsi": "Tsimshian",
|
| 436 |
-
"tt": "Tatar",
|
| 437 |
-
"tum": "Tumbuka",
|
| 438 |
-
"tup": "Tupi languages",
|
| 439 |
-
"tut": "Altaic languages",
|
| 440 |
-
"tvl": "Tuvalu",
|
| 441 |
-
"tw": "Twi",
|
| 442 |
-
"ty": "Tahitian",
|
| 443 |
-
"tyv": "Tuvinian",
|
| 444 |
-
"udm": "Udmurt",
|
| 445 |
-
"ug": "Uighur, Uyghur",
|
| 446 |
-
"uga": "Ugaritic",
|
| 447 |
-
"uk": "Ukrainian",
|
| 448 |
-
"umb": "Umbundu",
|
| 449 |
-
"und": "Undetermined",
|
| 450 |
-
"ur": "Urdu",
|
| 451 |
-
"uz": "Uzbek",
|
| 452 |
-
"vai": "Vai",
|
| 453 |
-
"ve": "Venda",
|
| 454 |
-
"vi": "Vietnamese",
|
| 455 |
-
"vo": "Volap\u00fck",
|
| 456 |
-
"vot": "Votic",
|
| 457 |
-
"wa": "Walloon",
|
| 458 |
-
"wak": "Wakashan languages",
|
| 459 |
-
"wal": "Wolaytta, Wolaitta",
|
| 460 |
-
"war": "Waray (Philippines)",
|
| 461 |
-
"was": "Washo",
|
| 462 |
-
"wen": "Sorbian languages",
|
| 463 |
-
"wo": "Wolof",
|
| 464 |
-
"xal": "Kalmyk, Oirat",
|
| 465 |
-
"xh": "Xhosa",
|
| 466 |
-
"yao": "Yao",
|
| 467 |
-
"yap": "Yapese",
|
| 468 |
-
"yi": "Yiddish",
|
| 469 |
-
"yo": "Yoruba",
|
| 470 |
-
"ypk": "Yupik languages",
|
| 471 |
-
"za": "Zhuang, Chuang",
|
| 472 |
-
"zap": "Zapotec",
|
| 473 |
-
"zen": "Zenaga",
|
| 474 |
-
"zh": "Chinese",
|
| 475 |
-
"znd": "Zande languages",
|
| 476 |
-
"zu": "Zulu",
|
| 477 |
-
"zun": "Zuni"
|
| 478 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tagging_app.py
CHANGED
|
@@ -2,9 +2,10 @@ import json
|
|
| 2 |
from pathlib import Path
|
| 3 |
from typing import Callable, Dict, List, Tuple
|
| 4 |
|
|
|
|
| 5 |
import streamlit as st
|
| 6 |
import yaml
|
| 7 |
-
from datasets.utils.
|
| 8 |
|
| 9 |
st.set_page_config(
|
| 10 |
page_title="HF Dataset Tagging App",
|
|
@@ -13,9 +14,20 @@ st.set_page_config(
|
|
| 13 |
initial_sidebar_state="auto",
|
| 14 |
)
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
task_set = json.load(open("task_set.json"))
|
| 17 |
license_set = json.load(open("license_set.json"))
|
| 18 |
-
language_set_restricted = json.load(open("language_set.json"))
|
| 19 |
|
| 20 |
multilinguality_set = {
|
| 21 |
"monolingual": "contains a single language",
|
|
@@ -74,30 +86,20 @@ def multiselect(
|
|
| 74 |
format_func: Callable = str,
|
| 75 |
):
|
| 76 |
valid_values, invalid_values = split_known(values, valid_set)
|
| 77 |
-
w.markdown(
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
""".format(
|
| 82 |
-
title=title, errors="" if len(invalid_values) == 0 else f"_Found invalid values:_ `{invalid_values}`"
|
| 83 |
-
)
|
| 84 |
-
)
|
| 85 |
return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func)
|
| 86 |
|
| 87 |
|
| 88 |
-
def validate_dict(state_dict: Dict)
|
| 89 |
try:
|
| 90 |
DatasetMetadata(**state_dict)
|
| 91 |
-
|
| 92 |
except Exception as e:
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
```
|
| 96 |
-
{e}
|
| 97 |
-
```
|
| 98 |
-
You're _very_ welcome to fix these issues and submit a new PR on [`datasets`](https://github.com/huggingface/datasets/)
|
| 99 |
-
"""
|
| 100 |
-
return valid
|
| 101 |
|
| 102 |
|
| 103 |
def new_state():
|
|
@@ -131,15 +133,6 @@ st.sidebar.markdown(
|
|
| 131 |
|
| 132 |
This app aims to make it easier to add structured tags to the datasets present in the library.
|
| 133 |
|
| 134 |
-
Each configuration requires its own tasks, as these often correspond to distinct sub-tasks. However, we provide the opportunity
|
| 135 |
-
to pre-load the tag sets from another dataset or configuration to avoid too much redundancy.
|
| 136 |
-
|
| 137 |
-
The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
|
| 138 |
-
|
| 139 |
-
### Preloading an existing tag set
|
| 140 |
-
|
| 141 |
-
You can load an existing tag set to get started if you want.
|
| 142 |
-
Beware that clicking pre-load will overwrite the current state!
|
| 143 |
"""
|
| 144 |
)
|
| 145 |
|
|
@@ -163,19 +156,23 @@ if leftbtn.button("pre-load"):
|
|
| 163 |
initial_state = existing_tag_sets[preloaded_id]
|
| 164 |
state = initial_state or new_state()
|
| 165 |
st.experimental_set_query_params(preload_dataset=preloaded_id)
|
| 166 |
-
if
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
| 171 |
|
| 172 |
if preloaded_id is not None and initial_state is not None:
|
| 173 |
-
valid = validate_dict(initial_state)
|
| 174 |
st.sidebar.markdown(
|
| 175 |
f"""
|
| 176 |
---
|
| 177 |
The current base tagset is [`{preloaded_id}`](https://huggingface.co/datasets/{preloaded_id})
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
Here is the matching yaml block:
|
| 180 |
|
| 181 |
```yaml
|
|
@@ -235,15 +232,23 @@ if "other" in state["multilinguality"]:
|
|
| 235 |
st.write(f"Registering other-{other_multilinguality} multilinguality")
|
| 236 |
state["multilinguality"][state["multilinguality"].index("other")] = f"other-{other_multilinguality}"
|
| 237 |
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
)
|
| 246 |
-
|
| 247 |
|
| 248 |
leftcol.markdown("### Dataset creators")
|
| 249 |
state["language_creators"] = multiselect(
|
|
@@ -329,12 +334,16 @@ state["size_categories"] = [
|
|
| 329 |
## Show results
|
| 330 |
########################
|
| 331 |
|
| 332 |
-
valid = validate_dict(state)
|
| 333 |
rightcol.markdown(
|
| 334 |
f"""
|
| 335 |
### Finalized tag set
|
| 336 |
|
| 337 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
|
| 339 |
```yaml
|
| 340 |
{yaml.dump(state)}
|
|
@@ -349,5 +358,4 @@ This is a standalone tool, it is useful to check for errors on an existing tagse
|
|
| 349 |
yamlblock = rightcol.text_area("Input your yaml here")
|
| 350 |
if yamlblock.strip() != "":
|
| 351 |
inputdict = yaml.safe_load(yamlblock)
|
| 352 |
-
|
| 353 |
-
rightcol.markdown(valid)
|
|
|
|
| 2 |
from pathlib import Path
|
| 3 |
from typing import Callable, Dict, List, Tuple
|
| 4 |
|
| 5 |
+
import langcodes as lc
|
| 6 |
import streamlit as st
|
| 7 |
import yaml
|
| 8 |
+
from datasets.utils.metadata import DatasetMetadata
|
| 9 |
|
| 10 |
st.set_page_config(
|
| 11 |
page_title="HF Dataset Tagging App",
|
|
|
|
| 14 |
initial_sidebar_state="auto",
|
| 15 |
)
|
| 16 |
|
| 17 |
+
# XXX: restyling errors as streamlit does not respect whitespaces on `st.error` and doesn't scroll horizontally, which
|
| 18 |
+
# generally makes things easier when reading error reports
|
| 19 |
+
st.markdown(
|
| 20 |
+
"""
|
| 21 |
+
<style>
|
| 22 |
+
div[role=alert] { overflow-x: scroll}
|
| 23 |
+
div.stAlert p { white-space: pre }
|
| 24 |
+
</style>
|
| 25 |
+
""",
|
| 26 |
+
unsafe_allow_html=True,
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
task_set = json.load(open("task_set.json"))
|
| 30 |
license_set = json.load(open("license_set.json"))
|
|
|
|
| 31 |
|
| 32 |
multilinguality_set = {
|
| 33 |
"monolingual": "contains a single language",
|
|
|
|
| 86 |
format_func: Callable = str,
|
| 87 |
):
|
| 88 |
valid_values, invalid_values = split_known(values, valid_set)
|
| 89 |
+
w.markdown(f"#### {title}")
|
| 90 |
+
if len(invalid_values) > 0:
|
| 91 |
+
w.markdown("Found the following invalid values:")
|
| 92 |
+
w.error(invalid_values)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
return w.multiselect(markdown, valid_set, default=valid_values, format_func=format_func)
|
| 94 |
|
| 95 |
|
| 96 |
+
def validate_dict(w: st.delta_generator.DeltaGenerator, state_dict: Dict):
|
| 97 |
try:
|
| 98 |
DatasetMetadata(**state_dict)
|
| 99 |
+
w.markdown("✅ This is a valid tagset! 🤗")
|
| 100 |
except Exception as e:
|
| 101 |
+
w.markdown("❌ This is an invalid tagset, here are the errors in it:")
|
| 102 |
+
w.error(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
|
| 105 |
def new_state():
|
|
|
|
| 133 |
|
| 134 |
This app aims to make it easier to add structured tags to the datasets present in the library.
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
"""
|
| 137 |
)
|
| 138 |
|
|
|
|
| 156 |
initial_state = existing_tag_sets[preloaded_id]
|
| 157 |
state = initial_state or new_state()
|
| 158 |
st.experimental_set_query_params(preload_dataset=preloaded_id)
|
| 159 |
+
if sum(len(v) if v is not None else 0 for v in state.values()) > 0:
|
| 160 |
+
if rightbtn.button("flush state"):
|
| 161 |
+
state = new_state()
|
| 162 |
+
initial_state = None
|
| 163 |
+
preloaded_id = None
|
| 164 |
+
st.experimental_set_query_params()
|
| 165 |
|
| 166 |
if preloaded_id is not None and initial_state is not None:
|
|
|
|
| 167 |
st.sidebar.markdown(
|
| 168 |
f"""
|
| 169 |
---
|
| 170 |
The current base tagset is [`{preloaded_id}`](https://huggingface.co/datasets/{preloaded_id})
|
| 171 |
+
"""
|
| 172 |
+
)
|
| 173 |
+
validate_dict(st.sidebar, initial_state)
|
| 174 |
+
st.sidebar.markdown(
|
| 175 |
+
f"""
|
| 176 |
Here is the matching yaml block:
|
| 177 |
|
| 178 |
```yaml
|
|
|
|
| 232 |
st.write(f"Registering other-{other_multilinguality} multilinguality")
|
| 233 |
state["multilinguality"][state["multilinguality"].index("other")] = f"other-{other_multilinguality}"
|
| 234 |
|
| 235 |
+
valid_values, invalid_values = list(), list()
|
| 236 |
+
for langtag in state["languages"]:
|
| 237 |
+
try:
|
| 238 |
+
lc.get(langtag)
|
| 239 |
+
valid_values.append(langtag)
|
| 240 |
+
except:
|
| 241 |
+
invalid_values.append(langtag)
|
| 242 |
+
leftcol.markdown("#### Languages")
|
| 243 |
+
if len(invalid_values) > 0:
|
| 244 |
+
leftcol.markdown("Found the following invalid values:")
|
| 245 |
+
leftcol.error(invalid_values)
|
| 246 |
+
|
| 247 |
+
langtags = leftcol.text_area(
|
| 248 |
+
"What languages are represented in the dataset? expected format is BCP47 tags separated for ';' e.g. 'en-US;fr-FR'",
|
| 249 |
+
value=";".join(valid_values),
|
| 250 |
)
|
| 251 |
+
state["languages"] = langtags.split(";")
|
| 252 |
|
| 253 |
leftcol.markdown("### Dataset creators")
|
| 254 |
state["language_creators"] = multiselect(
|
|
|
|
| 334 |
## Show results
|
| 335 |
########################
|
| 336 |
|
|
|
|
| 337 |
rightcol.markdown(
|
| 338 |
f"""
|
| 339 |
### Finalized tag set
|
| 340 |
|
| 341 |
+
"""
|
| 342 |
+
)
|
| 343 |
+
validate_dict(rightcol, state)
|
| 344 |
+
|
| 345 |
+
rightcol.markdown(
|
| 346 |
+
f"""
|
| 347 |
|
| 348 |
```yaml
|
| 349 |
{yaml.dump(state)}
|
|
|
|
| 358 |
yamlblock = rightcol.text_area("Input your yaml here")
|
| 359 |
if yamlblock.strip() != "":
|
| 360 |
inputdict = yaml.safe_load(yamlblock)
|
| 361 |
+
validate_dict(rightcol, inputdict)
|
|
|