import pandas as pd
from statistics import mean
import pandas as pd
import json
import numpy as np
from statistics import mean
import re
from datasets import load_dataset
import os
from collections import defaultdict
from src.envs import API, SAHARA_DATA, SAHARA_RESULTS
# Task id → human-readable task title, used to build leaderboard column headers.
# NOTE(review): 'xlni' looks like a typo of 'xnli' — but it must match the task
# ids stored in the results data, so confirm before renaming.
TASKS_LIST={
'xlni':'Cross-Lingual Natural Language Inference',
'lid':'Language Identification',
'news': 'News Classification',
'sentiment':'Sentiment Analysis',
'topic':'Topic Classification',
'mt_eng2xx':'Machine Translation - English to African',
'mt_fra2xx':'Machine Translation - French to African',
'mt_xx2xx':'Machine Translation - African to African',
'paraphrase':'Paraphrase',
'summary':'Summarization',
'title':'Title Generation',
'mmlu':'General Knowledge',
'mgsm':'Mathematical Word Problems',
'belebele':'Reading Comprehension',
'squad_qa':'Context-based Question Answering',
'ner':'Named Entity Recognition',
'phrase':'Phrase Chunking',
'pos':'Part-of-Speech Tagging',
}
# Cluster name → task ids in that cluster. Used for the per-cluster tabs and
# for the cluster averages in the main leaderboard.
CLUSTERS = {
"Text Classification": [
'xlni', 'lid', 'news', 'sentiment', 'topic',
],
"Text Generation": [
'mt_eng2xx', 'mt_fra2xx', 'mt_xx2xx', 'paraphrase', 'summary', 'title',
],
"MCCR": [
'mmlu', 'mgsm', 'belebele', 'squad_qa',
],
"Tokens": [
'ner', 'phrase', 'pos',
],
}
# Flat list of every task id across all clusters, in cluster order.
ALL_TASKS = [t for cluster in CLUSTERS.values() for t in cluster]
# ===== Authenticate and Load Data From Private HF Repo =====
def load_private_leaderboard_df():
    """Fetch the results file from the private HF dataset repo as a DataFrame.

    Forces a re-download on every call so the leaderboard reflects the latest
    uploaded results rather than a stale local cache.
    """
    dataset = load_dataset(
        SAHARA_DATA,
        name=None,
        data_files=SAHARA_RESULTS,
        split="train",
        download_mode="force_redownload",
    )
    return dataset.to_pandas()
# Metric id (as stored in the results data) → display name for table headers.
metrics_list={
'bleu_1k':'spBleu1K',
'accuracy':'Accuracy',
'f1':'Macro-F1',
'exact_match':'Exact Match',
'rougeL':'RougeL',
}
# ISO 639-3 code → language display name. Several codes may map to the same
# display name (e.g. 'swa'/'swh' → Swahili); build_langname_to_isos() inverts
# this mapping, grouping codes under one name.
LANG_ISO2NAME = {
'eng': 'English',
'fra': 'French',
# 'ara': 'Arabic',
'amh': 'Amharic',
'ewe': 'Ewe',
'hau': 'Hausa',
'ibo': 'Igbo',
'kin': 'Kinyarwanda',
'lin': 'Lingala',
'lug': 'Ganda',
'orm': 'Oromo',
'sna': 'Shona',
'sot': 'Southern Sotho',
'swa': 'Swahili', 'swh': 'Swahili',
'twi': 'Twi',
'wol': 'Wolof',
'xho': 'Xhosa',
'yor': 'Yoruba',
'zul': 'Zulu',
'afr': 'Afrikaans',
'run': 'Rundi',
'tir': 'Tigrinya',
'som': 'Somali',
'pcm': 'Nigerian Pidgin',
'teo': 'Teso',
'nyn': 'Nyankore/Nyankole',
'lgg': 'Lugbara',
'bem': 'Bemba/Chibemba',
'tsn': 'Tswana',
'bbj': 'Ghomálá',
'mos': 'Moore',
'bam': 'Bambara',
'fon': 'Fon',
'ach': 'Acholi',
'nso': 'Sepedi',
'tso': 'Tsonga',
'fuv': 'Fulfude Nigeria',
'gaz': 'Oromo, West Central',
'kea': 'Kabuverdianu',
'nya': 'Nyanja',
'ssw': 'Swati',
'luo': 'Dholuo/Luo',
'ven': 'Venda',
'kir':"Kirundi",
}
# ===== Build Language Name→ISOs map =====
def build_langname_to_isos(iso2name):
    """Invert an ISO→name mapping into name→{ISO codes}.

    Distinct codes sharing one display name (e.g. 'swa'/'swh' → Swahili) end
    up grouped in the same set.
    """
    grouped = defaultdict(set)
    for code, language in iso2name.items():
        grouped[language].add(code)
    return grouped
# Language display name → set of ISO codes for that language.
LANGNAME2ISOS = build_langname_to_isos(LANG_ISO2NAME)
#show only African langs
# NOTE(review): the keys here are full display names, so the 'eng'/'fra'
# entries in the exclusion list are redundant (only 'English'/'French' match).
LANG_NAME_LIST = sorted([lang for lang in LANGNAME2ISOS.keys() if lang not in ['eng', 'fra', 'English', 'French']])
def get_task_metric_map(df):
    """Map each task id to its metric id from the results frame.

    If a task appears on several rows, the last row wins (same as the
    row-by-row assignment it replaces).
    """
    return dict(zip(df["task"], df["metric"]))
def cluster_average(row, tasks):
    """Mean of the task scores in `row` that parse as floats.

    Entries that are missing or non-numeric (e.g. the "---" placeholder) are
    skipped; returns NaN when no score parses.
    """
    parsed = []
    for task in tasks:
        try:
            parsed.append(float(row[task]))
        except Exception:
            pass
    if not parsed:
        return np.nan
    return np.mean(parsed)
def add_medals_to_models(df, score_col="overall score"):
    """Sort models by `score_col` (descending) and prefix the top three with medals.

    Fix: work on a copy — the original assigned the temporary '__score_float'
    column directly to the caller's DataFrame and only dropped it from the
    sorted copy, leaking the helper column into the caller's frame.
    """
    df = df.copy()  # don't mutate the caller's frame
    score_float_col = "__score_float"
    # "---" marks a missing score; treat it as NaN so it sorts last.
    df[score_float_col] = df[score_col].apply(lambda x: float(x) if x != "---" else np.nan)
    # mergesort is stable, so equal scores keep their original relative order
    df = df.sort_values(by=score_float_col, ascending=False, kind="mergesort").reset_index(drop=True)

    def get_rank_symbols(scores):
        # Medals are assigned per distinct score value, so ties share a medal.
        unique_scores = sorted(set([s for s in scores if not pd.isna(s)]), reverse=True)
        symbols = ["🏆", "🥈", "🥉"]
        score_to_symbol = {s: symbols[i] for i, s in enumerate(unique_scores[:3])}
        return [score_to_symbol.get(s, "") for s in scores]

    df['rank_symbol'] = get_rank_symbols(df[score_float_col].tolist())
    df['model'] = df['rank_symbol'] + ' ' + df['model']
    df = df.drop(columns=['rank_symbol', score_float_col])
    return df
def format_cluster_table(df, cluster_tasks, metric_map):
    """Build one cluster's leaderboard: per-task scores plus a "Cluster Score" mean.

    Column headers are renamed to "<task title>\nMetric: <metric name>" so the
    UI can show the metric under the task name, and the top three models get
    medal prefixes.

    Fix: the header rename template was split across two physical lines inside
    a single-quoted f-string, which is a SyntaxError; the line break is now an
    explicit "\n" escape.
    """
    col_order = ["model"] + cluster_tasks
    # make sure every task column exists even if no model was scored on it
    for t in cluster_tasks:
        if t not in df.columns:
            df[t] = '---'
    df = df[col_order]
    for t in cluster_tasks:
        df[t] = df[t].apply(lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x)
    df["Cluster Score"] = df[cluster_tasks].apply(
        lambda row: cluster_average(row, cluster_tasks), axis=1
    )
    df["Cluster Score"] = df["Cluster Score"].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
    df = df[["model", "Cluster Score"] + cluster_tasks]
    # NOTE(review): metric_map.get(t, '') falls back to '' which is not a key
    # of metrics_list and would raise KeyError — presumably every cluster task
    # is always present in metric_map; confirm against the data.
    rename = {
        t: f"{TASKS_LIST[t]}\nMetric: {metrics_list[metric_map.get(t, '')]}"
        for t in cluster_tasks
    }
    df = df.rename(columns=rename)
    df = add_medals_to_models(df, score_col="Cluster Score")
    return df
def format_main_overall_table(df, metric_map):
    """Build the main leaderboard: one column per cluster mean plus a "Sahara Score".

    `metric_map` is accepted for interface parity with format_cluster_table but
    is not used here.
    """
    main = df.copy()
    # one averaged column per cluster (bind tasks via default arg, not closure)
    for cluster_name, cluster_tasks in CLUSTERS.items():
        main[cluster_name] = main[cluster_tasks].apply(
            lambda row, tasks=cluster_tasks: cluster_average(row, tasks), axis=1
        )
    cluster_cols = list(CLUSTERS.keys())
    # overall score = mean of the cluster scores that are present
    main["Overall Score"] = main[cluster_cols].apply(
        lambda row: np.nanmean([x for x in row if pd.notna(x)]), axis=1
    )
    # render numbers as fixed-point strings, missing values as "---"
    for col in cluster_cols + ["Overall Score"]:
        main[col] = main[col].apply(lambda v: f"{v:.2f}" if pd.notna(v) else "---")
    main = main[["model", "Overall Score"] + cluster_cols]
    main = add_medals_to_models(main, score_col="Overall Score")
    main.rename(columns={'Overall Score': 'Sahara Score'}, inplace=True)
    return main
def load_leaderboards():
    """Load the results data and build all leaderboard tables.

    Returns a 4-tuple: (cluster_tabs dict, main overall table, raw results
    DataFrame, task→metric map).

    Fix: the empty-data branch returned a 6-tuple (with extra [] and {}
    placeholders) while the normal path returned 4 items — the module-level
    4-way unpacking would raise ValueError whenever the data was empty. Both
    paths now return the same 4-tuple. Also removed the unused `all_langs`
    local.
    """
    df = load_private_leaderboard_df()
    metric_map = get_task_metric_map(df)
    main_df = df[df['leaderboard'] == 'main'].copy()
    if main_df.empty:
        # placeholder tables so the UI still renders something
        cluster_tabs = {c: pd.DataFrame([{"Info": "No data"}]) for c in CLUSTERS}
        main_overall_tab = pd.DataFrame([{"Info": "No data"}])
        return cluster_tabs, main_overall_tab, df, metric_map
    # one row per model, one column per task
    main_tasks_df = main_df.pivot_table(index='model', columns='task', values='score').reset_index()
    cluster_tabs = {}
    for cname, tasks in CLUSTERS.items():
        cluster_tabs[cname] = format_cluster_table(main_tasks_df, tasks, metric_map)
    # fill in tasks nobody was evaluated on so the overall table has every column
    for t in ALL_TASKS:
        if t not in main_tasks_df.columns:
            main_tasks_df[t] = np.nan
    main_overall_tab = format_main_overall_table(main_tasks_df, metric_map)
    return cluster_tabs, main_overall_tab, df, metric_map
def df_to_html(df, col_minwidth=90, col_maxwidth=140, model_col_width=400):
    """Render `df` as an HTML table, hiding any column whose name contains "task".

    The width parameters are currently unused; they are kept so existing
    callers keep working.
    """
    task_like = [col for col in df.columns if "task" in col]
    df = df.drop(columns=task_like, errors="ignore")
    df.columns.name = None
    return df.to_html(index=False, escape=False)
# Build all tables once at import time (downloads the results data via
# load_private_leaderboard_df, so importing this module hits the network).
cluster_tabs, main_overall_tab, all_df, metric_map = load_leaderboards()
def get_lang_table(lang_name):
    """Build the per-language leaderboard table for `lang_name`.

    Matches every leaderboard whose id contains one of the language's ISO
    codes (standalone, or on either side of a "xxx-yyy" translation pair),
    pivots to one column per task, prepends an averaged "Language Score", and
    medals the top three models.

    Fix: the column-header f-strings were split across physical lines inside
    single-quoted literals (a SyntaxError); line breaks are now explicit "\n"
    escapes. Also removed the unused `pair` local.
    """
    iso_codes = LANGNAME2ISOS.get(lang_name, [])
    if not iso_codes:
        return pd.DataFrame([{"Info": "No data for this language"}])
    # Find all leaderboards containing any ISO in this language group
    pattern = re.compile(r"(^|-)(" + "|".join(re.escape(iso) for iso in iso_codes) + r")(-|$)")
    matched_langs = [lb for lb in all_df['leaderboard'].unique() if lb not in ['main'] and pattern.search(lb)]
    lang_df = all_df[all_df['leaderboard'].isin(matched_langs)].copy()
    if lang_df.empty:
        return pd.DataFrame([{"Info": "No data for this language"}])

    def make_task_col(row):
        # Header shown in the UI: task title, optional translation pair, metric.
        lb = row['leaderboard']
        task = row['task']
        metric = row['metric']
        if '-' in lb:
            pair_lang = lb.split('-')
            return (
                f"{TASKS_LIST[task]}\n"
                f"{LANG_ISO2NAME[pair_lang[0]]} to {LANG_ISO2NAME[pair_lang[1]]}\n"
                f"Metric: {metrics_list[metric]}"
            )
        return f"{TASKS_LIST[task]}\nMetric: {metrics_list[metric]}"

    lang_df['task_col'] = lang_df.apply(make_task_col, axis=1)
    # one row per model, one column per task header
    table = lang_df.pivot_table(index='model', columns='task_col', values='score').reset_index()
    score_cols = [col for col in table.columns if col != 'model']
    for col in score_cols:
        table[col] = table[col].apply(lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x)

    def avg_score(row):
        # Mean over entries that parse as floats; "---" and NaN are skipped.
        vals = []
        for col in score_cols:
            try:
                vals.append(float(row[col]))
            except Exception:
                continue
        return np.mean(vals) if vals else np.nan

    table.insert(1, 'Language Score', table.apply(avg_score, axis=1).apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---"))
    table['__overall_score_float'] = table['Language Score'].apply(lambda x: float(x) if x != "---" else np.nan)
    # stable sort keeps tied models in their original order
    table = table.sort_values(by='__overall_score_float', ascending=False, kind="mergesort").reset_index(drop=True)

    def get_rank_symbols(scores):
        # Medals are assigned per distinct score value, so ties share a medal.
        unique_scores = sorted(set([s for s in scores if not pd.isna(s)]), reverse=True)
        symbols = ["🏆", "🥈", "🥉"]
        score_to_symbol = {s: symbols[i] for i, s in enumerate(unique_scores[:3])}
        return [score_to_symbol.get(s, "") for s in scores]

    table['rank_symbol'] = get_rank_symbols(table['__overall_score_float'].tolist())
    table['model'] = table['rank_symbol'] + ' ' + table['model']
    table = table.drop(columns=['rank_symbol', '__overall_score_float'])
    return table