# Spaces: Runtime error
# (page-status banner captured together with this file; not part of the module)
from datasets import load_dataset
# Constant values
# "owner/name" identifier of the leaderboard
# (presumably the Hub repo that stores the result files below -- confirm at the call site).
LEADERBOARD_PATH = "atlasia/Open-Arabic-Dialect-Identification-Leaderboard"
# Dataset identifier handed to `load_dataset` to fetch the evaluation split.
DATA_PATH = "atlasia/Arabic-LID-Leaderboard"
# JSON file names of the two leaderboard views: dialect confusion and multi-dialect.
DIALECT_CONFUSION_LEADERBOARD_FILE = "darija_leaderboard_dialect_confusion.json"
MULTI_DIALECTS_LEADERBOARD_FILE = "darija_leaderboard_multi_dialects.json"
# Classification metric names, in the order they are listed here.
# The last entry, 'n_test_samples', is (judging by its name) a sample count
# rather than a score -- confirm before averaging or ranking on it.
metrics = [
    'f1_score',
    'precision',
    'recall',
    'false_positive_rate',
    'false_negative_rate',
    'weighted_f1_score',
    'macro_f1_score',
    'micro_f1_score',
    'balanced_accuracy',
    'matthews_correlation',
    'specificity',
    'negative_predictive_value',
    'n_test_samples',
]
# Mapping from language code (each key carries the `_Arab` suffix) to the
# display label used for it. Labels are not strictly country names: some are
# regions ('Levantine', 'Mesopotamia', 'Kurdistan') or variety names
# ('MSA', 'Acehnese'). Several codes intentionally share one label, so the
# mapping is many-to-one and cannot be inverted without losing information.
language_mapping_dict = {
    'ace_Arab': 'Acehnese',
    'acm_Arab': 'Mesopotamia',  # 'Gilit Mesopotamian'
    'aeb_Arab': 'Tunisia',
    'ajp_Arab': 'Levantine',  # 'South Levantine'
    'apc_Arab': 'Levantine',
    'arb_Arab': 'MSA',
    'arq_Arab': 'Algeria',
    'ars_Arab': 'Saudi',  # Najdi is primarily Saudi Arabian
    'ary_Arab': 'Morocco',
    'arz_Arab': 'Egypt',
    'ayp_Arab': 'Mesopotamia',  # 'North Mesopotamian'
    'azb_Arab': 'Azerbaijan',  # South Azerbaijani pertains to this region
    'bcc_Arab': 'Balochistan',  # Southern Balochi is from Balochistan
    'bjn_Arab': 'Indonesia',  # Banjar is spoken in Indonesia
    'brh_Arab': 'Pakistan',  # Brahui is spoken in Pakistan
    'ckb_Arab': 'Kurdistan',  # Central Kurdish is mainly in Iraq
    'fuv_Arab': 'Nigeria',  # Hausa States Fulfulde
    'glk_Arab': 'Iran',  # Gilaki is spoken in Iran
    'hac_Arab': 'Iran',  # Gurani is also primarily spoken in Iran
    'kas_Arab': 'Kashmir',
    'knc_Arab': 'Nigeria',  # Central Kanuri is in Nigeria
    'lki_Arab': 'Iran',  # Laki is from Iran
    'lrc_Arab': 'Iran',  # Northern Luri is from Iran
    'min_Arab': 'Indonesia',  # Minangkabau is spoken in Indonesia
    'mzn_Arab': 'Iran',  # Mazanderani is spoken in Iran
    'ota_Arab': 'Turkey',  # Ottoman Turkish
    'pbt_Arab': 'Afghanistan',  # Southern Pashto
    'pnb_Arab': 'Pakistan',  # Western Panjabi
    'sdh_Arab': 'Iraq',  # Southern Kurdish
    'shu_Arab': 'Chad',  # Chadian Arabic
    'skr_Arab': 'Pakistan',  # Saraiki
    'snd_Arab': 'Pakistan',  # Sindhi
    'sus_Arab': 'Guinea',  # Susu
    'tuk_Arab': 'Turkmenistan',  # Turkmen
    'uig_Arab': 'Uighur (China)',  # Uighur
    'urd_Arab': 'Pakistan',  # Urdu
    'uzs_Arab': 'Uzbekistan',  # Southern Uzbek
    'zsm_Arab': 'Malaysia',  # Standard Malay
}
# Default values
# Default target label; 'Morocco' is the label language_mapping_dict gives to 'ary_Arab'.
target_label = "Morocco"
# Binary mode is off by default (exact effect is decided by the code that reads this flag).
is_binary = False
# Metrics shown by default in the multilingual leaderboard.
# Both names also appear in the `metrics` list.
default_metrics = [
    'f1_score',
    'false_positive_rate',
]
# Languages shown by default in the one-vs-all leaderboard.
# Only 'MSA' is enabled; the commented-out labels are alternatives that are
# currently switched off.
default_languages = [
    'MSA',
    # 'Egypt',
    # 'Algeria',
    # 'Tunisia',
    # 'Levantine',
]
# Load the evaluation dataset (test split) at import time.
# NOTE: this runs when the module is imported, so importing it triggers the
# dataset load and will raise if the load fails.
eval_dataset = load_dataset(DATA_PATH, split='test')

# Supported dialects: the distinct values of the dataset's "dialect" column.
all_target_languages = list(eval_dataset.unique("dialect"))
# Same values plus the extra 'All' option (this is a new list, not an alias).
supported_dialects = all_target_languages + ['All']
# Everything except 'All'.
# NOTE: this is the same list object as `all_target_languages`, not a copy --
# mutating one mutates the other.
languages_to_display_one_vs_all = all_target_languages