Spaces:
Running
Running
Delete src/helper.py
Browse files- src/helper.py +0 -312
src/helper.py
DELETED
@@ -1,312 +0,0 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
from statistics import mean
|
3 |
-
import pandas as pd
|
4 |
-
import json
|
5 |
-
import numpy as np
|
6 |
-
from statistics import mean
|
7 |
-
import re
|
8 |
-
from datasets import load_dataset
|
9 |
-
import os
|
10 |
-
from collections import defaultdict
|
11 |
-
from src.envs import API, SAHARA_DATA, SAHARA_RESULTS
|
12 |
-
# Task key -> human-readable task name shown in table headers.
# NOTE(review): 'xlni' looks like a typo for the usual 'xnli' abbreviation,
# but it is the key used throughout the results data, so it must stay as-is.
TASKS_LIST={
    'xlni':'Cross-Lingual Natural Language Inference',
    'lid':'Language Identification',
    'news': 'News Classification',
    'sentiment':'Sentiment Analysis',
    'topic':'Topic Classification',
    'mt_eng2xx':'Machine Translation - English to African',
    'mt_fra2xx':'Machine Translation - French to African',
    'mt_xx2xx':'Machine Translation - African to African',
    'paraphrase':'Paraphrase',
    'summary':'Summarization',
    'title':'Title Generation',
    'mmlu':'General Knowledge',
    'mgsm':'Mathematical Word Problems',
    'belebele':'Reading Comprehension',
    'squad_qa':'Context-based Question Answering',
    'ner':'Named Entity Recognition',
    'phrase':'Phrase Chunking',
    'pos':'Part-of-Speech Tagging',
}
# Cluster display name -> task keys aggregated into that cluster's score.
CLUSTERS = {
    "Text Classification": [
        'xlni', 'lid', 'news', 'sentiment', 'topic',
    ],
    "Text Generation": [
        'mt_eng2xx', 'mt_fra2xx', 'mt_xx2xx', 'paraphrase', 'summary', 'title',
    ],
    "MCCR": [
        'mmlu', 'mgsm', 'belebele', 'squad_qa',
    ],
    "Tokens": [
        'ner', 'phrase', 'pos',
    ],
}
# Flat list of every task key, in cluster order.
ALL_TASKS = [t for cluster in CLUSTERS.values() for t in cluster]
|
47 |
-
|
48 |
-
# ===== Authenticate and Load Data From Private HF Repo =====

def load_private_leaderboard_df():
    """Fetch the latest results split from the private HF repo as a DataFrame.

    Always forces a re-download so the leaderboard reflects the newest
    results file rather than a stale local cache.
    """
    results = load_dataset(
        path=SAHARA_DATA,
        name=None,
        data_files=SAHARA_RESULTS,
        split="train",
        download_mode="force_redownload",
    )
    return results.to_pandas()
|
59 |
-
# Metric key -> display label (HTML markup allowed in headers).
metrics_list={
    'bleu_1k':'spBleu<sup>1K</sup>',
    'accuracy':'Accuracy',
    'f1':'Macro-F1',
    'exact_match':'Exact Match',
    'rougeL':'RougeL',
}
# ISO 639-3 code -> language display name. Several codes may map to the
# same name (e.g. 'swa'/'swh' -> Swahili); build_langname_to_isos() below
# groups them back together for the per-language leaderboard lookup.
LANG_ISO2NAME = {
    'eng': 'English',
    'fra': 'French',
    # 'ara': 'Arabic',
    'amh': 'Amharic',
    'ewe': 'Ewe',
    'hau': 'Hausa',
    'ibo': 'Igbo',
    'kin': 'Kinyarwanda',
    'lin': 'Lingala',
    'lug': 'Ganda',
    'orm': 'Oromo',
    'sna': 'Shona',
    'sot': 'Southern Sotho',
    'swa': 'Swahili', 'swh': 'Swahili',
    'twi': 'Twi',
    'wol': 'Wolof',
    'xho': 'Xhosa',
    'yor': 'Yoruba',
    'zul': 'Zulu',
    'afr': 'Afrikaans',
    'run': 'Rundi',
    'tir': 'Tigrinya',
    'som': 'Somali',
    'pcm': 'Nigerian Pidgin',
    'teo': 'Teso',
    'nyn': 'Nyankore/Nyankole',
    'lgg': 'Lugbara',
    'bem': 'Bemba/Chibemba',
    'tsn': 'Tswana',
    'bbj': 'Ghomálá',
    'mos': 'Moore',
    'bam': 'Bambara',
    'fon': 'Fon',
    'ach': 'Acholi',
    'nso': 'Sepedi',
    'tso': 'Tsonga',
    'fuv': 'Fulfude Nigeria',
    'gaz': 'Oromo, West Central',
    'kea': 'Kabuverdianu',
    'nya': 'Nyanja',
    'ssw': 'Swati',
    'luo': 'Dholuo/Luo',
    'ven': 'Venda',
    # NOTE(review): 'kir' is the ISO 639-3 code for Kyrgyz; Kirundi is
    # normally 'run' (already present as 'Rundi') — confirm against the
    # codes actually used in the dataset's leaderboard ids.
    'kir':"Kirundi",
}
|
112 |
-
|
113 |
-
# ===== Build Language Name→ISOs map =====
def build_langname_to_isos(iso2name):
    """Invert an ISO→name mapping into name→{ISO codes}.

    Multiple ISO codes can share one display name (e.g. 'swa'/'swh' →
    'Swahili'), so each value is the set of codes for that name.
    """
    grouped = defaultdict(set)
    for code, language in iso2name.items():
        grouped[language].add(code)
    return grouped
|
119 |
-
|
120 |
-
# Display name -> set of ISO codes, built once at import time.
LANGNAME2ISOS = build_langname_to_isos(LANG_ISO2NAME)
# Show only African languages in the dropdown: English and French are excluded.
# NOTE(review): the keys here are full display names, so the 'eng'/'fra'
# entries in the exclusion list never match and are redundant leftovers.
LANG_NAME_LIST = sorted([lang for lang in LANGNAME2ISOS.keys() if lang not in ['eng', 'fra', 'English', 'French']])
|
123 |
-
|
124 |
-
def get_task_metric_map(df):
    """Return {task: metric} from the results frame; later rows win on duplicates."""
    return {record["task"]: record["metric"] for _, record in df.iterrows()}
|
129 |
-
|
130 |
-
def cluster_average(row, tasks):
    """Mean of the entries of *row* (for *tasks*) that parse as floats.

    Missing tasks and non-numeric placeholders such as "---" are skipped;
    returns NaN when no task yields a usable number.
    """
    def _as_float(task):
        try:
            return float(row[task])
        except Exception:  # absent column or unparsable cell
            return None

    usable = [v for v in map(_as_float, tasks) if v is not None]
    return np.mean(usable) if usable else np.nan
|
139 |
-
|
140 |
-
def add_medals_to_models(df, score_col="overall score"):
    """Sort rows by *score_col* (descending) and prepend medals to top models.

    "---" scores are treated as NaN and sink to the bottom; tied scores share
    the same medal because ranking is over distinct score values. The stable
    mergesort keeps the incoming order for equal scores. Returns a new frame:
    the caller's DataFrame is no longer mutated (fix — previously the
    temporary "__score_float" helper column leaked onto the input frame).
    """
    out = df.copy()  # work on a copy so the caller's frame stays clean
    tmp_col = "__score_float"
    out[tmp_col] = out[score_col].apply(lambda x: float(x) if x != "---" else np.nan)
    out = out.sort_values(by=tmp_col, ascending=False, kind="mergesort").reset_index(drop=True)

    def rank_symbols(scores):
        # The three highest distinct scores get gold/silver/bronze.
        distinct = sorted({s for s in scores if not pd.isna(s)}, reverse=True)
        medals = ["🏆", "🥈", "🥉"]
        by_score = {s: medals[i] for i, s in enumerate(distinct[:3])}
        return [by_score.get(s, "") for s in scores]

    symbols = rank_symbols(out[tmp_col].tolist())
    # Keep the original "<symbol> <model>" join (unranked rows get a leading space).
    out["model"] = [f"{sym} {name}" for sym, name in zip(symbols, out["model"])]
    return out.drop(columns=[tmp_col])
|
153 |
-
|
154 |
-
def format_cluster_table(df, cluster_tasks, metric_map):
    """Build the display table for one task cluster.

    Ensures a column per task ('---' placeholder when absent), formats numeric
    scores to two decimals, adds a cluster-average "Cluster Score" column,
    renames task columns to "<task name><br>Metric: <metric label>" headers,
    and medals the top models via add_medals_to_models.
    """
    col_order = ["model"] + cluster_tasks
    for t in cluster_tasks:
        if t not in df.columns:
            # NOTE(review): this assignment mutates the caller's frame
            # before the column-selection copy below.
            df[t] = '---'
    df = df[col_order]
    # Two-decimal display strings for numeric cells; placeholders pass through.
    for t in cluster_tasks:
        df[t] = df[t].apply(lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x)
    df["Cluster Score"] = df[cluster_tasks].apply(
        lambda row: cluster_average(row, cluster_tasks), axis=1
    )
    df["Cluster Score"] = df["Cluster Score"].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
    df = df[["model", "Cluster Score"] + cluster_tasks]
    # Header shows the human-readable task name plus the metric used for it.
    rename = {t: f"{TASKS_LIST[t]}<br>Metric: {metrics_list[metric_map.get(t, '')]}" for t in cluster_tasks}
    df = df.rename(columns=rename)
    df = add_medals_to_models(df, score_col="Cluster Score")
    return df
|
172 |
-
|
173 |
-
def format_main_overall_table(df, metric_map):
    """Build the main leaderboard: one column per cluster plus a "Sahara Score".

    *df* must have one column per task (the caller fills missing tasks with
    NaN). Each cluster column is the mean of its tasks' usable scores; the
    overall score averages the non-NaN cluster scores. *metric_map* is accepted
    for signature parity with format_cluster_table but is not used here.
    """
    main = df.copy()
    # One aggregate column per cluster (NaN when no task in the cluster scored).
    for cname, tasks in CLUSTERS.items():
        main[cname] = main[tasks].apply(lambda row: cluster_average(row, tasks), axis=1)
    cluster_cols = list(CLUSTERS.keys())
    main["Overall Score"] = main[cluster_cols].apply(
        lambda row: np.nanmean([x for x in row if pd.notna(x)]), axis=1
    )
    # Two-decimal display strings; "---" marks missing scores.
    for c in cluster_cols + ["Overall Score"]:
        main[c] = main[c].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
    main = main[["model", "Overall Score"] + cluster_cols]
    main = add_medals_to_models(main, score_col="Overall Score")
    main.rename(columns={'Overall Score': 'Sahara Score'}, inplace=True)
    return main
|
187 |
-
|
188 |
-
def load_leaderboards():
    """Load the results data and build every main-leaderboard table.

    Returns a 4-tuple (cluster_tabs, main_overall_tab, df, metric_map) —
    the module-level unpack expects exactly four values, so the no-data
    branch must match (fix: it previously returned SIX values, which would
    have raised ValueError at import time whenever the 'main' slice was
    empty). Also drops the unused `all_langs` local.
    """
    df = load_private_leaderboard_df()
    metric_map = get_task_metric_map(df)
    main_df = df[df['leaderboard'] == 'main'].copy()
    if main_df.empty:
        # Placeholder tables so the UI still renders when no results exist.
        cluster_tabs = {c: pd.DataFrame([{"Info": "No data"}]) for c in CLUSTERS}
        main_overall_tab = pd.DataFrame([{"Info": "No data"}])
        return cluster_tabs, main_overall_tab, df, metric_map
    # One column per task, one row per model.
    main_tasks_df = main_df.pivot_table(index='model', columns='task', values='score').reset_index()
    cluster_tabs = {}
    for cname, tasks in CLUSTERS.items():
        cluster_tabs[cname] = format_cluster_table(main_tasks_df, tasks, metric_map)
    # Ensure every known task column exists before building the overall table.
    for t in ALL_TASKS:
        if t not in main_tasks_df.columns:
            main_tasks_df[t] = np.nan
    main_overall_tab = format_main_overall_table(main_tasks_df, metric_map)
    return cluster_tabs, main_overall_tab, df, metric_map
|
206 |
-
|
207 |
-
def df_to_html(df, col_minwidth=90, col_maxwidth=140, model_col_width=400):
    """Render *df* as an HTML table (no index; HTML inside cells left unescaped).

    Columns whose name contains "task" are dropped — they are pivot helper
    columns, not display columns. The width parameters are kept for
    backward-compatible calls; styling is applied by the page's CSS, so they
    are unused here (the ~50-line commented-out inline <style> block that
    previously cluttered this function has been removed).
    """
    hidden = [col for col in df.columns if "task" in col]
    df = df.drop(columns=hidden, errors="ignore")
    df.columns.name = None  # clear the pivot's columns-axis label
    return df.to_html(index=False, escape=False)
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
# Build all leaderboard tables once at import time; get_lang_table below
# reads the cached all_df / metric_map instead of re-fetching per request.
cluster_tabs, main_overall_tab, all_df, metric_map = load_leaderboards()
|
264 |
-
|
265 |
-
def get_lang_table(lang_name):
    """Build the per-language leaderboard table for a display language name.

    Looks up every ISO code mapped to *lang_name*, selects leaderboards whose
    id contains one of those codes as a '-'-delimited segment (e.g. "hau" or
    "eng-hau"), pivots model x task scores, inserts an average "Language
    Score", sorts by it descending and medals the top three models. Returns a
    single-row "No data" frame when nothing matches.
    """
    iso_codes = LANGNAME2ISOS.get(lang_name, [])
    if not iso_codes:
        return pd.DataFrame([{"Info": "No data for this language"}])
    # Find all leaderboards containing any ISO in this language group; codes
    # only match whole '-'-delimited segments of the leaderboard id.
    pattern = re.compile(r"(^|-)(" + "|".join(re.escape(iso) for iso in iso_codes) + r")(-|$)")
    matched_langs = [lb for lb in all_df['leaderboard'].unique() if lb not in ['main'] and pattern.search(lb)]
    lang_df = all_df[all_df['leaderboard'].isin(matched_langs)].copy()
    if lang_df.empty:
        return pd.DataFrame([{"Info": "No data for this language"}])
    def make_task_col(row):
        # Column header: task name, translation pair (for "src-tgt" boards),
        # and the metric used for the task.
        lb = row['leaderboard']
        task = row['task']
        metric = row['metric']
        if '-' in lb:
            pair_lang = lb.split('-')
            pair = lb.replace('-', '_')  # NOTE(review): unused leftover of an earlier header format
            return f"{TASKS_LIST[task]} <br> {LANG_ISO2NAME[pair_lang[0]]} to {LANG_ISO2NAME[pair_lang[1]]} <br> Metric: {metrics_list[metric]}"
        else:
            return f"{TASKS_LIST[task]} <br> Metric: {metrics_list[metric]}"
    lang_df['task_col'] = lang_df.apply(make_task_col, axis=1)
    table = lang_df.pivot_table(index='model', columns='task_col', values='score').reset_index()
    score_cols = [col for col in table.columns if col != 'model']
    # Two-decimal display strings for numeric cells; placeholders pass through.
    for col in score_cols:
        table[col] = table[col].apply(lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x)
    def avg_score(row):
        # Mean over the cells that parse as floats (skips "---" and the like).
        vals = []
        for col in score_cols:
            try:
                v = float(row[col])
                vals.append(v)
            except Exception:
                continue
        return np.mean(vals) if vals else np.nan
    table.insert(1, 'Language Score', table.apply(avg_score, axis=1).apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---"))
    table['__overall_score_float'] = table['Language Score'].apply(lambda x: float(x) if x != "---" else np.nan)
    table = table.sort_values(by='__overall_score_float', ascending=False, kind="mergesort").reset_index(drop=True)
    def get_rank_symbols(scores):
        # The three highest distinct scores get gold/silver/bronze.
        unique_scores = sorted(set([s for s in scores if not pd.isna(s)]), reverse=True)
        symbols = ["🏆", "🥈", "🥉"]
        score_to_symbol = {s: symbols[i] for i, s in enumerate(unique_scores[:3])}
        return [score_to_symbol.get(s, "") for s in scores]
    table['rank_symbol'] = get_rank_symbols(table['__overall_score_float'].tolist())
    table['model'] = table['rank_symbol'] + ' ' + table['model']
    table = table.drop(columns=['rank_symbol', '__overall_score_float'])
    return table
|
312 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|