import json
import os
import re
from collections import defaultdict
from statistics import mean

import numpy as np
import pandas as pd
from datasets import load_dataset

from src.envs import API, SAHARA_DATA, SAHARA_RESULTS
# Display names for every task key. Note: 'xlni' is spelled this way (rather
# than the usual 'xnli') because the keys must match the task names in the
# results dataset.
TASKS_LIST = {
    'xlni': 'Cross-Lingual Natural Language Inference',
    'lid': 'Language Identification',
    'news': 'News Classification',
    'sentiment': 'Sentiment Analysis',
    'topic': 'Topic Classification',
    'mt_eng2xx': 'Machine Translation - English to African',
    'mt_fra2xx': 'Machine Translation - French to African',
    'mt_xx2xx': 'Machine Translation - African to African',
    'paraphrase': 'Paraphrase',
    'summary': 'Summarization',
    'title': 'Title Generation',
    'mmlu': 'General Knowledge',
    'mgsm': 'Mathematical Word Problems',
    'belebele': 'Reading Comprehension',
    'squad_qa': 'Context-based Question Answering',
    'ner': 'Named Entity Recognition',
    'phrase': 'Phrase Chunking',
    'pos': 'Part-of-Speech Tagging',
}
# Task clusters used for the per-cluster tabs and the overall score.
CLUSTERS = {
    "Text Classification": [
        'xlni', 'lid', 'news', 'sentiment', 'topic',
    ],
    "Text Generation": [
        'mt_eng2xx', 'mt_fra2xx', 'mt_xx2xx', 'paraphrase', 'summary', 'title',
    ],
    # Knowledge, math, and reading-comprehension tasks.
    "MCCR": [
        'mmlu', 'mgsm', 'belebele', 'squad_qa',
    ],
    # Token-level tagging tasks.
    "Tokens": [
        'ner', 'phrase', 'pos',
    ],
}
ALL_TASKS = [t for cluster in CLUSTERS.values() for t in cluster]
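# Sanity check (a small added guard): every clustered task needs a display
# name, otherwise format_cluster_table() below would fail with a bare KeyError.
_missing_names = [t for t in ALL_TASKS if t not in TASKS_LIST]
assert not _missing_names, f"tasks missing from TASKS_LIST: {_missing_names}"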
# ===== Authenticate and Load Data From Private HF Repo =====
def load_private_leaderboard_df():
    """Fetch the raw results table from the private results repo.

    `datasets` picks up the Hugging Face token from the environment or the
    cached login, which is what grants access to the private repo.
    """
    ds = load_dataset(
        path=SAHARA_DATA,
        name=None,
        data_files=SAHARA_RESULTS,
        split="train",
        download_mode="force_redownload",  # always refresh; never serve a stale cache
    )
    return ds.to_pandas()
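# Optional schema guard (a sketch; the column names below are inferred from
# how the frame is used in this module, not from a documented schema). It can
# be called on the frame returned by load_private_leaderboard_df().
REQUIRED_COLUMNS = {"leaderboard", "model", "task", "metric", "score"}

def validate_results_df(df):
    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        raise ValueError(f"results file is missing columns: {sorted(missing)}")
    return df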
metrics_list = {
    'bleu_1k': 'spBleu<sup>1K</sup>',
    'accuracy': 'Accuracy',
    'f1': 'Macro-F1',
    'exact_match': 'Exact Match',
    'rougeL': 'RougeL',
}
LANG_ISO2NAME = {
    'eng': 'English',
    'fra': 'French',
    # 'ara': 'Arabic',
    'amh': 'Amharic',
    'ewe': 'Ewe',
    'hau': 'Hausa',
    'ibo': 'Igbo',
    'kin': 'Kinyarwanda',
    'lin': 'Lingala',
    'lug': 'Ganda',
    'orm': 'Oromo',
    'sna': 'Shona',
    'sot': 'Southern Sotho',
    'swa': 'Swahili', 'swh': 'Swahili',
    'twi': 'Twi',
    'wol': 'Wolof',
    'xho': 'Xhosa',
    'yor': 'Yoruba',
    'zul': 'Zulu',
    'afr': 'Afrikaans',
    'run': 'Rundi',
    'tir': 'Tigrinya',
    'som': 'Somali',
    'pcm': 'Nigerian Pidgin',
    'teo': 'Teso',
    'nyn': 'Nyankore/Nyankole',
    'lgg': 'Lugbara',
    'bem': 'Bemba/Chibemba',
    'tsn': 'Tswana',
    'bbj': 'Ghomálá',
    'mos': 'Moore',
    'bam': 'Bambara',
    'fon': 'Fon',
    'ach': 'Acholi',
    'nso': 'Sepedi',
    'tso': 'Tsonga',
    'fuv': 'Nigerian Fulfulde',
    'gaz': 'Oromo, West Central',
    'kea': 'Kabuverdianu',
    'nya': 'Nyanja',
    'ssw': 'Swati',
    'luo': 'Dholuo/Luo',
    'ven': 'Venda',
    # Note: in ISO 639-3, 'kir' is Kyrgyz; the key is kept as-is because it
    # must match the leaderboard keys in the results data.
    'kir': 'Kirundi',
}
# ===== Build Language Name -> ISO codes map =====
def build_langname_to_isos(iso2name):
    name2isos = defaultdict(set)
    for iso, name in iso2name.items():
        name2isos[name].add(iso)
    return name2isos

LANGNAME2ISOS = build_langname_to_isos(LANG_ISO2NAME)

# Show only African languages. The keys are display names, so filtering on
# ISO codes like 'eng'/'fra' would be a no-op.
LANG_NAME_LIST = sorted(lang for lang in LANGNAME2ISOS if lang not in ('English', 'French'))
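# Both Swahili codes collapse onto one display name, so a single language tab
# can aggregate several leaderboard keys. A cheap illustrative check:
assert LANGNAME2ISOS["Swahili"] == {"swa", "swh"}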
def get_task_metric_map(df):
    """Map each task to its metric; if a task appears with several metrics,
    the last row seen wins."""
    mapping = {}
    for _, row in df.iterrows():
        mapping[row["task"]] = row["metric"]
    return mapping
def cluster_average(row, tasks):
    """Average the numeric task scores in `row`, skipping '---' placeholders
    and NaNs (float('nan') parses successfully and would otherwise poison the
    mean)."""
    vals = []
    for t in tasks:
        try:
            v = float(row[t])
        except (TypeError, ValueError):
            continue
        if not np.isnan(v):
            vals.append(v)
    return np.mean(vals) if vals else np.nan
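# Illustrative check on a hypothetical row: the placeholder is skipped and the
# average is taken over what remains.
assert cluster_average(pd.Series({"lid": 80.0, "news": "---"}), ["lid", "news"]) == 80.0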
def add_medals_to_models(df, score_col="overall score"):
    """Sort by `score_col` (descending) and prefix the top three *unique*
    scores with medal emoji, so tied models share a medal."""
    df = df.copy()  # avoid leaking helper columns into the caller's frame
    score_float_col = "__score_float"
    df[score_float_col] = df[score_col].apply(lambda x: float(x) if x != "---" else np.nan)
    df = df.sort_values(by=score_float_col, ascending=False, kind="mergesort").reset_index(drop=True)

    def get_rank_symbols(scores):
        unique_scores = sorted({s for s in scores if not pd.isna(s)}, reverse=True)
        symbols = ["🏆", "🥈", "🥉"]
        score_to_symbol = {s: symbols[i] for i, s in enumerate(unique_scores[:3])}
        return [score_to_symbol.get(s, "") for s in scores]

    df['rank_symbol'] = get_rank_symbols(df[score_float_col].tolist())
    df['model'] = df['rank_symbol'] + ' ' + df['model']
    df = df.drop(columns=['rank_symbol', score_float_col])
    return df
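# Usage sketch on a hypothetical frame: ties share a medal because symbols are
# assigned per unique score, not per row.
_demo = add_medals_to_models(pd.DataFrame({
    "model": ["a", "b", "c"],
    "overall score": ["90.00", "90.00", "80.00"],
}))
assert _demo["model"].tolist() == ["🏆 a", "🏆 b", "🥈 c"]
del _demo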
def format_cluster_table(df, cluster_tasks, metric_map):
    col_order = ["model"] + cluster_tasks
    for t in cluster_tasks:
        if t not in df.columns:
            df[t] = '---'
    df = df[col_order].copy()
    # Render scores to two decimals; missing pivot cells (NaN) become '---'
    # rather than the string 'nan'.
    for t in cluster_tasks:
        df[t] = df[t].apply(
            lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) and pd.notna(x)
            else ('---' if pd.isna(x) else x)
        )
    df["Cluster Score"] = df[cluster_tasks].apply(
        lambda row: cluster_average(row, cluster_tasks), axis=1
    )
    df["Cluster Score"] = df["Cluster Score"].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
    df = df[["model", "Cluster Score"] + cluster_tasks]
    rename = {t: f"{TASKS_LIST[t]}<br>Metric: {metrics_list[metric_map.get(t, '')]}" for t in cluster_tasks}
    df = df.rename(columns=rename)
    df = add_medals_to_models(df, score_col="Cluster Score")
    return df
def format_main_overall_table(df, metric_map):
    main = df.copy()
    for cname, tasks in CLUSTERS.items():
        main[cname] = main[tasks].apply(lambda row: cluster_average(row, tasks), axis=1)
    cluster_cols = list(CLUSTERS.keys())
    # DataFrame.mean skips NaN, so a model missing a whole cluster is averaged
    # over the clusters it does have (and numpy's empty-slice warning is avoided).
    main["Overall Score"] = main[cluster_cols].mean(axis=1)
    for c in cluster_cols + ["Overall Score"]:
        main[c] = main[c].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
    main = main[["model", "Overall Score"] + cluster_cols]
    main = add_medals_to_models(main, score_col="Overall Score")
    main.rename(columns={'Overall Score': 'Sahara Score'}, inplace=True)
    return main
def load_leaderboards():
    df = load_private_leaderboard_df()
    metric_map = get_task_metric_map(df)
    main_df = df[df['leaderboard'] == 'main'].copy()
    if main_df.empty:
        cluster_tabs = {c: pd.DataFrame([{"Info": "No data"}]) for c in CLUSTERS}
        main_overall_tab = pd.DataFrame([{"Info": "No data"}])
        # Return the same four values as the normal path; the caller unpacks four.
        return cluster_tabs, main_overall_tab, df, metric_map
    main_tasks_df = main_df.pivot_table(index='model', columns='task', values='score').reset_index()
    cluster_tabs = {}
    for cname, tasks in CLUSTERS.items():
        cluster_tabs[cname] = format_cluster_table(main_tasks_df, tasks, metric_map)
    for t in ALL_TASKS:
        if t not in main_tasks_df.columns:
            main_tasks_df[t] = np.nan
    main_overall_tab = format_main_overall_table(main_tasks_df, metric_map)
    return cluster_tabs, main_overall_tab, df, metric_map
def df_to_html(df, col_minwidth=90, col_maxwidth=140, model_col_width=400):
    # Remove any column whose name contains "task"
    drop_cols = [col for col in df.columns if "task" in col]
    df = df.drop(columns=drop_cols, errors="ignore")
    df.columns.name = None
    html = ""
    # html = f"""
    # <style>
    # .gradio-container-5-34-1 .prose table {{
    #     border-top: 2px solid #dca02a;
    #     border-bottom: 2px solid #dca02a;
    #     margin-bottom: 20px;
    #     margin-left: auto;
    #     margin-right: auto;
    #     width: 100%;
    #     border-collapse: collapse;
    #     table-layout: fixed;
    # }}
    # .gradio-container-5-34-1 .prose thead tr {{
    #     background: #fffbe9;
    #     border-bottom: 2px solid #dca02a;
    # }}
    # .gradio-container-5-34-1 .prose th {{
    #     color: #7d3561;
    #     font-weight: bold;
    #     font-size: 20px;
    #     background: #fffbe9;
    #     padding: 8px 5px;
    #     vertical-align: middle;
    #     border: 0px solid #e0e0e0;
    # }}
    # td {{
    #     font-size: 18px;
    #     padding: 8px 5px;
    #     border: 0px solid #e0e0e0;
    #     vertical-align: middle;
    # }}
    # th:first-child, td:first-child {{
    #     min-width: {model_col_width}px !important;
    #     max-width: {model_col_width}px !important;
    #     width: {model_col_width}px !important;
    #     text-align: left !important;
    # }}
    # th:not(:first-child), td:not(:first-child) {{
    #     min-width: {col_minwidth}px;
    #     max-width: {col_maxwidth}px;
    #     width: auto;
    #     text-align: center;
    # }}
    # </style>
    # """
    html += df.to_html(index=False, escape=False)
    return html
cluster_tabs, main_overall_tab, all_df, metric_map = load_leaderboards()
def get_lang_table(lang_name):
    iso_codes = LANGNAME2ISOS.get(lang_name, [])
    if not iso_codes:
        return pd.DataFrame([{"Info": "No data for this language"}])
    # Find all leaderboards containing any ISO in this language group.
    pattern = re.compile(r"(^|-)(" + "|".join(re.escape(iso) for iso in iso_codes) + r")(-|$)")
    matched_langs = [lb for lb in all_df['leaderboard'].unique() if lb != 'main' and pattern.search(lb)]
    lang_df = all_df[all_df['leaderboard'].isin(matched_langs)].copy()
    if lang_df.empty:
        return pd.DataFrame([{"Info": "No data for this language"}])

    def make_task_col(row):
        lb = row['leaderboard']
        task = row['task']
        metric = row['metric']
        if '-' in lb:
            # Translation-style leaderboards are keyed "src-tgt".
            pair_lang = lb.split('-')
            return f"{TASKS_LIST[task]} <br> {LANG_ISO2NAME[pair_lang[0]]} to {LANG_ISO2NAME[pair_lang[1]]} <br> Metric: {metrics_list[metric]}"
        return f"{TASKS_LIST[task]} <br> Metric: {metrics_list[metric]}"

    lang_df['task_col'] = lang_df.apply(make_task_col, axis=1)
    table = lang_df.pivot_table(index='model', columns='task_col', values='score').reset_index()
    score_cols = [col for col in table.columns if col != 'model']
    # Render scores to two decimals; missing pivot cells (NaN) become '---'
    # rather than the string 'nan'.
    for col in score_cols:
        table[col] = table[col].apply(
            lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) and pd.notna(x)
            else ('---' if pd.isna(x) else x)
        )
    # The per-language average and the medal logic are the same as for the
    # cluster tables, so reuse those helpers instead of duplicating them.
    table.insert(
        1, 'Language Score',
        table.apply(lambda row: cluster_average(row, score_cols), axis=1)
             .apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
    )
    table = add_medals_to_models(table, score_col='Language Score')
    return table
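if __name__ == "__main__":
    # Lightweight smoke run (a sketch; assumes the environment holds a token
    # with access to the private results repo).
    print(df_to_html(main_overall_tab)[:300])
    for _name, _tab in cluster_tabs.items():
        print(_name, _tab.shape)
    print(get_lang_table("Swahili").head())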