elmadany committed on
Commit
6121f8b
Β·
verified Β·
1 Parent(s): 3513e8e

Upload 6 files

Browse files
Files changed (6) hide show
  1. app.py +230 -0
  2. envs.py +12 -0
  3. helper.py +312 -0
  4. requirements.txt +16 -0
  5. src/envs.py +12 -0
  6. src/helper.py +312 -0
app.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+
4
+ from src.helper import *
5
+ # Custom CSS to replicate the Google-style card design from the image
6
+ custom_head_html = """
7
+ <link rel="stylesheet" href="https://africa.dlnlp.ai/sahara/font-awesome/css/font-awesome.min.css">
8
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css">
9
+ <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.2.0/jquery.min.js"></script>
10
+ <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js"></script>
11
+ <link rel="stylesheet" type="text/css" href="./public/css/style.min.css">
12
+ <script defer src="./public/js/script.js"></script>
13
+ <link rel="preconnect" href="https://fonts.googleapis.com">
14
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
15
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&family=Rubik:wght@400;600&display=swap" rel="stylesheet">
16
+ """
17
+
18
+ new_header_html = """
19
+ <center>
20
+ <br><br><br>
21
+ <img src="https://africa.dlnlp.ai/sahara/img/sahara_web_main.jpg" alt="Sahara logo" width="60%">
22
+ </p>
23
+ </center>
24
+ <br style="height:1px;">
25
+ """
26
+
27
+ google_style_css = """
28
+ div.gradio-container-5-34-1{
29
+ background:#FFFBF5 !important;
30
+ }
31
+
32
+ div.svelte-1nguped {
33
+ background: white !important;
34
+ }
35
+ /* Main Content Area */
36
+ .content-section {
37
+ padding: 60px 0;
38
+ }
39
+ .content-card {
40
+ background-color: #fff;
41
+ border-radius: 12px;
42
+ box-shadow: 0 10px 15px -3px rgba(0,0,0,0.1), 0 4px 6px -2px rgba(0,0,0,0.05);
43
+ padding: 40px;
44
+ margin-bottom: 40px;
45
+ }
46
+ .btn-cite {
47
+ color: #7d3561;
48
+ font-size: 16px;
49
+ margin: 0 3px; /* Add spacing between multiple icons */
50
+ }
51
+ .content-card h4 {
52
+ font-family: "Rubik", sans-serif;
53
+ color: #7d3561;
54
+ }
55
+ .content-card h2 {
56
+ font-family: "Rubik", sans-serif;
57
+ font-size: 30px;
58
+ font-weight: 600;
59
+ line-height: 1.25;
60
+ letter-spacing: -1px;
61
+ color: #2f3b7d;
62
+ text-transform:none;
63
+
64
+ /* font-size: 30px;
65
+ font-weight: bold;
66
+ color: #D97706; /* Brand Orange */
67
+ margin-top: 0;
68
+ margin-bottom: 20px; */
69
+ }
70
+ .content-card p {
71
+ /* font-size: 18px; */
72
+ /* line-height: 1.7; */
73
+ }
74
+
75
+ div.svelte-wv8on1{
76
+ # border: 2px solid #074e4a !important;
77
+ border-top: 0 !important;
78
+ /* background-color: #fff2eb !important; */
79
+ padding: 10px !important;
80
+ }
81
+ .padding.svelte-phx28p {
82
+ padding:0 !important;
83
+ }
84
+
85
+ .tab-wrapper.svelte-1tcem6n.svelte-1tcem6n {
86
+ display: flex;
87
+ align-items: center;
88
+ justify-content: space-between;
89
+ position: relative;
90
+ height: 0 !important;
91
+ padding-bottom: 0 !important;
92
+ }
93
+
94
+
95
+ .selected.svelte-1tcem6n.svelte-1tcem6n {
96
+ background-color: #7d3561 !important;
97
+ color: #fff !important;
98
+ }
99
+ .tabs.svelte-1tcem6n.svelte-1tcem6n {
100
+ /* border: 1px solid #dca02a !important; */
101
+ border-top: 0 !important;
102
+ /* background-color: #dca02a !important; */
103
+ }
104
+ button.svelte-1tcem6n.svelte-1tcem6n {
105
+ color: #7d3561 !important;
106
+ /* border: 1px solid #dca02a !important; */
107
+ font-weight: bold;
108
+ /* font-size: 16px; */
109
+ padding: 8px 5px;
110
+ }
111
+ .tab-container.svelte-1tcem6n.svelte-1tcem6n:after {
112
+ content: "";
113
+ position: absolute;
114
+ bottom: 0;
115
+ left: 0;
116
+ right: 0;
117
+ height: 2px;
118
+ background-color: #7d3561 !important;
119
+ }
120
+
121
+ .gradio-container-5-34-1 .prose table,
122
+ .gradio-container-5-34-1 .prose tr,
123
+ .gradio-container-5-34-1 .prose td,
124
+ .gradio-container-5-34-1 .prose th {
125
+ border: 0 !important;
126
+ border-top: 2px solid #dca02a;
127
+ border-bottom: 2px solid #dca02a;
128
+ }
129
+
130
+
131
+ .gradio-container-5-34-1 .prose table {
132
+ border-top: 2px solid #dca02a !important;
133
+ border-bottom: 2px solid #dca02a !important;
134
+ margin-bottom:20px;
135
+ margin-left: auto;
136
+ margin-right: auto;
137
+ width: 100%;
138
+ border-collapse: collapse;
139
+ table-layout: fixed;
140
+ }
141
+ .gradio-container-5-34-1 .prose thead tr {
142
+ border-bottom: 2px solid #dca02a !important;
143
+ }
144
+ .gradio-container-5-34-1 .prose th {
145
+ color: #7d3561;
146
+ font-weight: bold;
147
+ /* font-size: 20px; */
148
+ padding: 8px 5px;
149
+ vertical-align: middle;
150
+ border: 0 !important;
151
+ }
152
+ .gradio-container-5-34-1 .prose td {
153
+ /* font-size: 18px; */
154
+ padding: 8px 5px;
155
+ border: 0 !important;
156
+ vertical-align: middle;
157
+ }
158
+ .gradio-container-5-34-1 .prose th:first-child,
159
+ .gradio-container-5-34-1 .prose td:first-child {
160
+ min-width: 400px !important;
161
+ max-width: 400px !important;
162
+ width:400px !important;
163
+ text-align: left !important;
164
+ }
165
+ .gradio-container-5-34-1 .prose th:not(:first-child),
166
+ .gradio-container-5-34-1 .prose td:not(:first-child) {
167
+ min-width: 90px;
168
+ max-width: 140px;
169
+ width: auto;
170
+ text-align: center;
171
+ }
172
+ """
173
+
174
+ introduction_text = """
175
+
176
+ """
177
+ # with gr.Blocks(title="Sahara Leaderboard", css=custom_css) as demo:
178
+ # with gr.Blocks(title="Sahara Leaderboard") as demo:
179
+ with gr.Blocks(css=google_style_css) as demo:
180
+ # Use elem_classes to apply our custom CSS to this group
181
+ gr.HTML(new_header_html)
182
+ # gr.Markdown(introduction_text)
183
+ # with gr.Group(elem_classes="content-card"):
184
+ # gr.Markdown(introduction_text)
185
+ # gr.Markdown(
186
+ # "HI # πŸ† Model Evaluation Leaderboard (Clustered, Private HF Dataset)\n"
187
+ # "- Language dropdown uses names, not ISO codes; e.g. Swahili = (swa+swh results).\n"
188
+ # "- Tabs by cluster; each cluster tab shows only its tasks, with fixed column width via CSS HTML tables."
189
+ # )
190
+ with gr.Group(elem_classes="content-card"):
191
+ gr.Markdown("<br>")
192
+ with gr.Tabs():
193
+ # Main leaderboard
194
+ with gr.Tab("Main Leaderboard"):
195
+ gr.HTML("<br><br><center><h2>Main Leaderboard</h2></center><br>")
196
+ gr.HTML(df_to_html(main_overall_tab))
197
+ # Task Clusters leaderboards
198
+ with gr.Tab("Task-Clusters Leaderboards"):
199
+ gr.HTML("<br><br><center><h2>Task-Clusters Leaderboards</h2></center><br>")
200
+ CLUSTERS_NAME=[cname for cname, cdf in cluster_tabs.items()]
201
+
202
+ clusters_dropdown = gr.Dropdown(choices=CLUSTERS_NAME, label="Select Task-CLuster", interactive=True)
203
+ def get_claster_table(cluster_name):
204
+ for cname, cdf in cluster_tabs.items():
205
+ if cname== cluster_name:
206
+ return cdf
207
+ cluster_table_component = gr.HTML(df_to_html(get_claster_table(CLUSTERS_NAME[0])) if CLUSTERS_NAME else "<b>No cluser found</b>")
208
+ def update_cluster_table(cluster_name):
209
+ df = get_claster_table(cluster_name)
210
+ return df_to_html(df)
211
+ clusters_dropdown.change(update_cluster_table, clusters_dropdown, cluster_table_component)
212
+
213
+
214
+ # for cname, cdf in cluster_tabs.items():
215
+ # with gr.Tab(f"{cname}"):
216
+ # gr.HTML(df_to_html(cdf))
217
+ # Languages Leaderboards
218
+ with gr.Tab("Language-Specific Leaderboards"):
219
+ gr.HTML("<br><br><center><h2>Language-Specific Leaderboards</h2></center><br>")
220
+ lang_dropdown = gr.Dropdown(choices=LANG_NAME_LIST, label="Select Language", interactive=True)
221
+ lang_table_component = gr.HTML(df_to_html(get_lang_table(LANG_NAME_LIST[0])) if LANG_NAME_LIST else "<b>No languages found</b>")
222
+ def update_lang_table(lang_name):
223
+ df = get_lang_table(lang_name)
224
+ return df_to_html(df)
225
+ lang_dropdown.change(update_lang_table, lang_dropdown, lang_table_component)
226
+
227
+
228
+
229
+ if __name__ == "__main__":
230
+ demo.launch(share=True)
envs.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from huggingface_hub import HfApi
4
+
5
+ # Info to change for your repository
6
+ # ----------------------------------
7
+
8
+
9
+ HF_TOKEN = os.environ.get("HF_TOKEN")
10
+ SAHARA_DATA = os.environ.get("SAHARA_DATA")
11
+ SAHARA_RESULTS = os.environ.get("SAHARA_RESULTS")
12
+ API = HfApi(token=HF_TOKEN)
helper.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from statistics import mean
3
+ import pandas as pd
4
+ import json
5
+ import numpy as np
6
+ from statistics import mean
7
+ import re
8
+ from datasets import load_dataset
9
+ import os
10
+ from collections import defaultdict
11
+ from src.envs import API, SAHARA_DATA, SAHARA_RESULTS
12
+ TASKS_LIST={
13
+ 'xlni':'Cross-Lingual Natural Language Inference',
14
+ 'lid':'Language Identification',
15
+ 'news': 'News Classification',
16
+ 'sentiment':'Sentiment Analysis',
17
+ 'topic':'Topic Classification',
18
+ 'mt_eng2xx':'Machine Translation - English to African',
19
+ 'mt_fra2xx':'Machine Translation - French to African',
20
+ 'mt_xx2xx':'Machine Translation - African to African',
21
+ 'paraphrase':'Paraphrase',
22
+ 'summary':'Summarization',
23
+ 'title':'Title Generation',
24
+ 'mmlu':'General Knowledge',
25
+ 'mgsm':'Mathematical Word Problems',
26
+ 'belebele':'Reading Comprehension',
27
+ 'squad_qa':'Context-based Question Answering',
28
+ 'ner':'Named Entity Recognition',
29
+ 'phrase':'Phrase Chunking',
30
+ 'pos':'Part-of-Speech Tagging',
31
+ }
32
+ CLUSTERS = {
33
+ "Text Classification": [
34
+ 'xlni', 'lid', 'news', 'sentiment', 'topic',
35
+ ],
36
+ "Text Generation": [
37
+ 'mt_eng2xx', 'mt_fra2xx', 'mt_xx2xx', 'paraphrase', 'summary', 'title',
38
+ ],
39
+ "MCCR": [
40
+ 'mmlu', 'mgsm', 'belebele', 'squad_qa',
41
+ ],
42
+ "Tokens": [
43
+ 'ner', 'phrase', 'pos',
44
+ ],
45
+ }
46
+ ALL_TASKS = [t for cluster in CLUSTERS.values() for t in cluster]
47
+
48
+ # ===== Authenticate and Load Data From Private HF Repo =====
49
+
50
def load_private_leaderboard_df():
    """Fetch the results file from the private HF dataset repo as a DataFrame.

    ``force_redownload`` bypasses the local datasets cache so the leaderboard
    always reflects the most recently uploaded results file.
    """
    load_kwargs = dict(
        path=SAHARA_DATA,
        name=None,
        data_files=SAHARA_RESULTS,
        split="train",
        download_mode="force_redownload",
    )
    return load_dataset(**load_kwargs).to_pandas()
59
+ metrics_list={
60
+ 'bleu_1k':'spBleu<sup>1K</sup>',
61
+ 'accuracy':'Accuracy',
62
+ 'f1':'Macro-F1',
63
+ 'exact_match':'Exact Match',
64
+ 'rougeL':'RougeL',
65
+ }
66
+ LANG_ISO2NAME = {
67
+ 'eng': 'English',
68
+ 'fra': 'French',
69
+ # 'ara': 'Arabic',
70
+ 'amh': 'Amharic',
71
+ 'ewe': 'Ewe',
72
+ 'hau': 'Hausa',
73
+ 'ibo': 'Igbo',
74
+ 'kin': 'Kinyarwanda',
75
+ 'lin': 'Lingala',
76
+ 'lug': 'Ganda',
77
+ 'orm': 'Oromo',
78
+ 'sna': 'Shona',
79
+ 'sot': 'Southern Sotho',
80
+ 'swa': 'Swahili', 'swh': 'Swahili',
81
+ 'twi': 'Twi',
82
+ 'wol': 'Wolof',
83
+ 'xho': 'Xhosa',
84
+ 'yor': 'Yoruba',
85
+ 'zul': 'Zulu',
86
+ 'afr': 'Afrikaans',
87
+ 'run': 'Rundi',
88
+ 'tir': 'Tigrinya',
89
+ 'som': 'Somali',
90
+ 'pcm': 'Nigerian Pidgin',
91
+ 'teo': 'Teso',
92
+ 'nyn': 'Nyankore/Nyankole',
93
+ 'lgg': 'Lugbara',
94
+ 'bem': 'Bemba/Chibemba',
95
+ 'tsn': 'Tswana',
96
+ 'bbj': 'GhomΓ‘lΓ‘',
97
+ 'mos': 'Moore',
98
+ 'bam': 'Bambara',
99
+ 'fon': 'Fon',
100
+ 'ach': 'Acholi',
101
+ 'nso': 'Sepedi',
102
+ 'tso': 'Tsonga',
103
+ 'fuv': 'Fulfude Nigeria',
104
+ 'gaz': 'Oromo, West Central',
105
+ 'kea': 'Kabuverdianu',
106
+ 'nya': 'Nyanja',
107
+ 'ssw': 'Swati',
108
+ 'luo': 'Dholuo/Luo',
109
+ 'ven': 'Venda',
110
+ 'kir':"Kirundi",
111
+ }
112
+
113
+ # ===== Build Language Name→ISOs map =====
114
def build_langname_to_isos(iso2name):
    """Invert an ISO→name mapping into name→{ISO codes}.

    Several ISO codes can share one display name (e.g. 'swa' and 'swh' are
    both Swahili), hence the set-valued defaultdict.
    """
    inverted = defaultdict(set)
    for code, language in iso2name.items():
        inverted[language].add(code)
    return inverted
119
+
120
+ LANGNAME2ISOS = build_langname_to_isos(LANG_ISO2NAME)
121
+ #show only African langs
122
+ LANG_NAME_LIST = sorted([lang for lang in LANGNAME2ISOS.keys() if lang not in ['eng', 'fra', 'English', 'French']])
123
+
124
def get_task_metric_map(df):
    """Map each task name in *df* to its metric column value.

    When a task appears on multiple rows, the last occurrence wins
    (same as the original row-by-row assignment).
    """
    return {row["task"]: row["metric"] for _, row in df.iterrows()}
129
+
130
def cluster_average(row, tasks):
    """Mean of the task scores in *row* that parse as floats.

    Entries that are missing or non-numeric (e.g. the "---" placeholder)
    are skipped; returns NaN when no score is usable.
    """
    parsed = []
    for task in tasks:
        try:
            parsed.append(float(row[task]))
        except Exception:
            pass
    if not parsed:
        return np.nan
    return np.mean(parsed)
139
+
140
def add_medals_to_models(df, score_col="overall score"):
    """Sort *df* by *score_col* descending and prefix medal emoji to models.

    The top three *distinct* scores get πŸ†/πŸ₯ˆ/πŸ₯‰ prepended to the ``model``
    column, so tied models share a medal. ``"---"`` placeholder scores sort
    last (treated as NaN) and get no medal.

    Bug fix: the original added the temporary ``__score_float`` column to
    the caller's frame in place before copying; we now work on a copy so
    the input DataFrame is left untouched.
    """
    df = df.copy()
    score_float_col = "__score_float"
    df[score_float_col] = df[score_col].apply(lambda x: float(x) if x != "---" else np.nan)
    # mergesort is stable: equal scores keep their original relative order.
    df = df.sort_values(by=score_float_col, ascending=False, kind="mergesort").reset_index(drop=True)

    def get_rank_symbols(scores):
        # Medals are assigned per distinct score so ties share one rank.
        unique_scores = sorted({s for s in scores if not pd.isna(s)}, reverse=True)
        symbols = ["πŸ†", "πŸ₯ˆ", "πŸ₯‰"]
        score_to_symbol = {s: symbols[i] for i, s in enumerate(unique_scores[:3])}
        return [score_to_symbol.get(s, "") for s in scores]

    df['rank_symbol'] = get_rank_symbols(df[score_float_col].tolist())
    df['model'] = df['rank_symbol'] + ' ' + df['model']
    return df.drop(columns=['rank_symbol', score_float_col])
153
+
154
def format_cluster_table(df, cluster_tasks, metric_map):
    """Build the display table for one task cluster.

    Args:
        df: wide per-model frame with one column per task (from pivot_table).
            NOTE(review): missing-task columns are added to *df* in place,
            so the caller's frame is mutated — confirm this is intended.
        cluster_tasks: task keys belonging to this cluster.
        metric_map: task -> metric key, used only for the header labels.

    Returns:
        DataFrame with columns: model, "Cluster Score", then one
        "<task label><br>Metric: <metric>" column per task, sorted by
        score with medal emoji prepended to the top models.
    """
    col_order = ["model"] + cluster_tasks
    # Ensure every cluster task exists so the column selection below works.
    for t in cluster_tasks:
        if t not in df.columns:
            df[t] = '---'
    df = df[col_order]
    # Format numeric scores as 2-decimal strings; leave placeholders alone.
    for t in cluster_tasks:
        df[t] = df[t].apply(lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x)
    # Row-wise mean over the tasks whose values parse as floats.
    df["Cluster Score"] = df[cluster_tasks].apply(
        lambda row: cluster_average(row, cluster_tasks), axis=1
    )
    df["Cluster Score"] = df["Cluster Score"].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
    df = df[["model", "Cluster Score"] + cluster_tasks]
    # rename = {t: f"{t}\n{metric_map.get(t, '')}" for t in cluster_tasks}
    # Header cells embed HTML (<br>, <sup>); rendered later with escape=False.
    rename = {t: f"{TASKS_LIST[t]}<br>Metric: {metrics_list[metric_map.get(t, '')]}" for t in cluster_tasks}
    df = df.rename(columns=rename)
    df = add_medals_to_models(df, score_col="Cluster Score")
    return df
172
+
173
def format_main_overall_table(df, metric_map):
    """Build the main leaderboard: per-cluster averages plus a Sahara Score.

    Args:
        df: wide per-model frame with one column per task; the caller
            fills any missing task columns with NaN first.
        metric_map: unused here; kept for signature symmetry with
            format_cluster_table.

    Returns:
        DataFrame with columns: model, "Sahara Score", then one column
        per cluster, sorted with medal emoji prepended to the top models.
    """
    main = df.copy()
    # One averaged column per cluster (NaN-tolerant via cluster_average).
    for cname, tasks in CLUSTERS.items():
        main[cname] = main[tasks].apply(lambda row: cluster_average(row, tasks), axis=1)
    cluster_cols = list(CLUSTERS.keys())
    # Overall score = mean of the cluster scores that are available.
    main["Overall Score"] = main[cluster_cols].apply(
        lambda row: np.nanmean([x for x in row if pd.notna(x)]), axis=1
    )
    for c in cluster_cols + ["Overall Score"]:
        main[c] = main[c].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
    main = main[["model", "Overall Score"] + cluster_cols]
    main = add_medals_to_models(main, score_col="Overall Score")
    main.rename(columns={'Overall Score': 'Sahara Score'}, inplace=True)
    return main
187
+
188
def load_leaderboards():
    """Load the results dataset and build every leaderboard table.

    Returns:
        (cluster_tabs, main_overall_tab, df, metric_map) where
        ``cluster_tabs`` maps cluster name -> formatted DataFrame,
        ``main_overall_tab`` is the main leaderboard DataFrame,
        ``df`` is the raw results frame, and ``metric_map`` maps
        task -> metric key.
    """
    df = load_private_leaderboard_df()
    metric_map = get_task_metric_map(df)
    main_df = df[df['leaderboard'] == 'main'].copy()
    if main_df.empty:
        cluster_tabs = {c: pd.DataFrame([{"Info": "No data"}]) for c in CLUSTERS}
        main_overall_tab = pd.DataFrame([{"Info": "No data"}])
        # Bug fix: this branch used to return a 6-tuple while the normal
        # path (and the module-level 4-name unpacking) expects 4 values,
        # which raised ValueError whenever the dataset was empty.
        return cluster_tabs, main_overall_tab, df, metric_map
    # Wide frame: one row per model, one column per task.
    main_tasks_df = main_df.pivot_table(index='model', columns='task', values='score').reset_index()
    cluster_tabs = {}
    for cname, tasks in CLUSTERS.items():
        cluster_tabs[cname] = format_cluster_table(main_tasks_df, tasks, metric_map)
    # The overall table needs every task column present, even if unscored.
    for t in ALL_TASKS:
        if t not in main_tasks_df.columns:
            main_tasks_df[t] = np.nan
    main_overall_tab = format_main_overall_table(main_tasks_df, metric_map)
    return cluster_tabs, main_overall_tab, df, metric_map
206
+
207
def df_to_html(df, col_minwidth=90, col_maxwidth=140, model_col_width=400):
    """Render *df* as an HTML table string for a gr.HTML component.

    Columns whose name contains "task" (e.g. the pivot helper column
    ``task_col``) are dropped before rendering. ``escape=False`` keeps the
    ``<br>``/``<sup>`` markup embedded in the header labels.

    The width parameters are currently unused — table sizing moved to the
    page-level CSS — but are kept for backward-compatible signatures.
    (The dead, commented-out per-table <style> block was removed.)
    """
    drop_cols = [col for col in df.columns if "task" in col]
    df = df.drop(columns=drop_cols, errors="ignore")
    # Suppress the pivot table's columns-axis name in the rendered header.
    df.columns.name = None
    return df.to_html(index=False, escape=False)
260
+
261
+
262
+
263
+ cluster_tabs, main_overall_tab, all_df, metric_map = load_leaderboards()
264
+
265
def get_lang_table(lang_name):
    """Build the per-language leaderboard table for *lang_name*.

    Collects every leaderboard whose id contains one of the language's ISO
    codes (e.g. Swahili matches both 'swa' and 'swh'), pivots to one row
    per model, inserts an averaged "Language Score", sorts descending and
    prefixes medal emoji to the top three distinct scores.

    Returns a one-row "Info" DataFrame when the language has no data.
    """
    iso_codes = LANGNAME2ISOS.get(lang_name, [])
    if not iso_codes:
        return pd.DataFrame([{"Info": "No data for this language"}])
    # Find all leaderboards containing any ISO in this language group;
    # codes are matched as whole '-'-separated segments, not substrings.
    pattern = re.compile(r"(^|-)(" + "|".join(re.escape(iso) for iso in iso_codes) + r")(-|$)")
    matched_langs = [lb for lb in all_df['leaderboard'].unique() if lb not in ['main'] and pattern.search(lb)]
    lang_df = all_df[all_df['leaderboard'].isin(matched_langs)].copy()
    if lang_df.empty:
        return pd.DataFrame([{"Info": "No data for this language"}])
    def make_task_col(row):
        # Human-readable column header; embeds HTML rendered with escape=False.
        lb = row['leaderboard']
        task = row['task']
        metric = row['metric']
        if '-' in lb:
            # Translation leaderboards are named "<src>-<tgt>" ISO pairs.
            pair_lang = lb.split('-')
            pair = lb.replace('-', '_')
            # return f"{TASKS_LIST[task]}({task}) {LANG_ISO2NAME[pair_lang[0]]} to {LANG_ISO2NAME[pair_lang[1]]} ({pair})\n{metric}"
            return f"{TASKS_LIST[task]} <br> {LANG_ISO2NAME[pair_lang[0]]} to {LANG_ISO2NAME[pair_lang[1]]} <br> Metric: {metrics_list[metric]}"
        else:
            return f"{TASKS_LIST[task]} <br> Metric: {metrics_list[metric]}"
    lang_df['task_col'] = lang_df.apply(make_task_col, axis=1)
    # One row per model, one column per formatted task header.
    table = lang_df.pivot_table(index='model', columns='task_col', values='score').reset_index()
    score_cols = [col for col in table.columns if col != 'model']
    # Format numeric scores as 2-decimal strings; leave non-numerics alone.
    for col in score_cols:
        table[col] = table[col].apply(lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x)
    def avg_score(row):
        # Mean over the scores that parse as floats; NaN when none do.
        vals = []
        for col in score_cols:
            try:
                v = float(row[col])
                vals.append(v)
            except Exception:
                continue
        return np.mean(vals) if vals else np.nan
    table.insert(1, 'Language Score', table.apply(avg_score, axis=1).apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---"))
    table['__overall_score_float'] = table['Language Score'].apply(lambda x: float(x) if x != "---" else np.nan)
    # Stable mergesort keeps the original order among tied scores.
    table = table.sort_values(by='__overall_score_float', ascending=False, kind="mergesort").reset_index(drop=True)
    def get_rank_symbols(scores):
        # Medals map to distinct scores, so tied models share a medal.
        unique_scores = sorted(set([s for s in scores if not pd.isna(s)]), reverse=True)
        symbols = ["πŸ†", "πŸ₯ˆ", "πŸ₯‰"]
        score_to_symbol = {s: symbols[i] for i, s in enumerate(unique_scores[:3])}
        return [score_to_symbol.get(s, "") for s in scores]
    table['rank_symbol'] = get_rank_symbols(table['__overall_score_float'].tolist())
    table['model'] = table['rank_symbol'] + ' ' + table['model']
    table = table.drop(columns=['rank_symbol', '__overall_score_float'])
    return table
312
+
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ APScheduler
2
+ black
3
+ datasets
4
+ gradio
5
+ gradio[oauth]
6
+ gradio_leaderboard==0.0.13
7
+ gradio_client
8
+ huggingface-hub>=0.18.0
9
+ matplotlib
10
+ numpy
11
+ pandas
12
+ python-dateutil
13
+ tqdm
14
+ transformers
15
+ tokenizers>=0.15.0
16
+ sentencepiece
src/envs.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from huggingface_hub import HfApi
4
+
5
+ # Info to change for your repository
6
+ # ----------------------------------
7
+
8
+
9
+ HF_TOKEN = os.environ.get("HF_TOKEN")
10
+ SAHARA_DATA = os.environ.get("SAHARA_DATA")
11
+ SAHARA_RESULTS = os.environ.get("SAHARA_RESULTS")
12
+ API = HfApi(token=HF_TOKEN)
src/helper.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from statistics import mean
3
+ import pandas as pd
4
+ import json
5
+ import numpy as np
6
+ from statistics import mean
7
+ import re
8
+ from datasets import load_dataset
9
+ import os
10
+ from collections import defaultdict
11
+ from src.envs import API, SAHARA_DATA, SAHARA_RESULTS
12
+ TASKS_LIST={
13
+ 'xlni':'Cross-Lingual Natural Language Inference',
14
+ 'lid':'Language Identification',
15
+ 'news': 'News Classification',
16
+ 'sentiment':'Sentiment Analysis',
17
+ 'topic':'Topic Classification',
18
+ 'mt_eng2xx':'Machine Translation - English to African',
19
+ 'mt_fra2xx':'Machine Translation - French to African',
20
+ 'mt_xx2xx':'Machine Translation - African to African',
21
+ 'paraphrase':'Paraphrase',
22
+ 'summary':'Summarization',
23
+ 'title':'Title Generation',
24
+ 'mmlu':'General Knowledge',
25
+ 'mgsm':'Mathematical Word Problems',
26
+ 'belebele':'Reading Comprehension',
27
+ 'squad_qa':'Context-based Question Answering',
28
+ 'ner':'Named Entity Recognition',
29
+ 'phrase':'Phrase Chunking',
30
+ 'pos':'Part-of-Speech Tagging',
31
+ }
32
+ CLUSTERS = {
33
+ "Text Classification": [
34
+ 'xlni', 'lid', 'news', 'sentiment', 'topic',
35
+ ],
36
+ "Text Generation": [
37
+ 'mt_eng2xx', 'mt_fra2xx', 'mt_xx2xx', 'paraphrase', 'summary', 'title',
38
+ ],
39
+ "MCCR": [
40
+ 'mmlu', 'mgsm', 'belebele', 'squad_qa',
41
+ ],
42
+ "Tokens": [
43
+ 'ner', 'phrase', 'pos',
44
+ ],
45
+ }
46
+ ALL_TASKS = [t for cluster in CLUSTERS.values() for t in cluster]
47
+
48
+ # ===== Authenticate and Load Data From Private HF Repo =====
49
+
50
+ def load_private_leaderboard_df():
51
+ ds = load_dataset(
52
+ path=SAHARA_DATA,
53
+ name=None,
54
+ data_files=SAHARA_RESULTS,
55
+ split="train",
56
+ download_mode="force_redownload"
57
+ )
58
+ return ds.to_pandas()
59
+ metrics_list={
60
+ 'bleu_1k':'spBleu<sup>1K</sup>',
61
+ 'accuracy':'Accuracy',
62
+ 'f1':'Macro-F1',
63
+ 'exact_match':'Exact Match',
64
+ 'rougeL':'RougeL',
65
+ }
66
+ LANG_ISO2NAME = {
67
+ 'eng': 'English',
68
+ 'fra': 'French',
69
+ # 'ara': 'Arabic',
70
+ 'amh': 'Amharic',
71
+ 'ewe': 'Ewe',
72
+ 'hau': 'Hausa',
73
+ 'ibo': 'Igbo',
74
+ 'kin': 'Kinyarwanda',
75
+ 'lin': 'Lingala',
76
+ 'lug': 'Ganda',
77
+ 'orm': 'Oromo',
78
+ 'sna': 'Shona',
79
+ 'sot': 'Southern Sotho',
80
+ 'swa': 'Swahili', 'swh': 'Swahili',
81
+ 'twi': 'Twi',
82
+ 'wol': 'Wolof',
83
+ 'xho': 'Xhosa',
84
+ 'yor': 'Yoruba',
85
+ 'zul': 'Zulu',
86
+ 'afr': 'Afrikaans',
87
+ 'run': 'Rundi',
88
+ 'tir': 'Tigrinya',
89
+ 'som': 'Somali',
90
+ 'pcm': 'Nigerian Pidgin',
91
+ 'teo': 'Teso',
92
+ 'nyn': 'Nyankore/Nyankole',
93
+ 'lgg': 'Lugbara',
94
+ 'bem': 'Bemba/Chibemba',
95
+ 'tsn': 'Tswana',
96
+ 'bbj': 'GhomΓ‘lΓ‘',
97
+ 'mos': 'Moore',
98
+ 'bam': 'Bambara',
99
+ 'fon': 'Fon',
100
+ 'ach': 'Acholi',
101
+ 'nso': 'Sepedi',
102
+ 'tso': 'Tsonga',
103
+ 'fuv': 'Fulfude Nigeria',
104
+ 'gaz': 'Oromo, West Central',
105
+ 'kea': 'Kabuverdianu',
106
+ 'nya': 'Nyanja',
107
+ 'ssw': 'Swati',
108
+ 'luo': 'Dholuo/Luo',
109
+ 'ven': 'Venda',
110
+ 'kir':"Kirundi",
111
+ }
112
+
113
+ # ===== Build Language Name→ISOs map =====
114
+ def build_langname_to_isos(iso2name):
115
+ name2isos = defaultdict(set)
116
+ for iso, name in iso2name.items():
117
+ name2isos[name].add(iso)
118
+ return name2isos
119
+
120
+ LANGNAME2ISOS = build_langname_to_isos(LANG_ISO2NAME)
121
+ #show only African langs
122
+ LANG_NAME_LIST = sorted([lang for lang in LANGNAME2ISOS.keys() if lang not in ['eng', 'fra', 'English', 'French']])
123
+
124
+ def get_task_metric_map(df):
125
+ mapping = {}
126
+ for _, row in df.iterrows():
127
+ mapping[row["task"]] = row["metric"]
128
+ return mapping
129
+
130
+ def cluster_average(row, tasks):
131
+ vals = []
132
+ for t in tasks:
133
+ try:
134
+ v = float(row[t])
135
+ vals.append(v)
136
+ except Exception:
137
+ continue
138
+ return np.mean(vals) if vals else np.nan
139
+
140
+ def add_medals_to_models(df, score_col="overall score"):
141
+ score_float_col = "__score_float"
142
+ df[score_float_col] = df[score_col].apply(lambda x: float(x) if x != "---" else np.nan)
143
+ df = df.sort_values(by=score_float_col, ascending=False, kind="mergesort").reset_index(drop=True)
144
+ def get_rank_symbols(scores):
145
+ unique_scores = sorted(set([s for s in scores if not pd.isna(s)]), reverse=True)
146
+ symbols = ["πŸ†", "πŸ₯ˆ", "πŸ₯‰"]
147
+ score_to_symbol = {s: symbols[i] for i, s in enumerate(unique_scores[:3])}
148
+ return [score_to_symbol.get(s, "") for s in scores]
149
+ df['rank_symbol'] = get_rank_symbols(df[score_float_col].tolist())
150
+ df['model'] = df['rank_symbol'] + ' ' + df['model']
151
+ df = df.drop(columns=['rank_symbol', score_float_col])
152
+ return df
153
+
154
+ def format_cluster_table(df, cluster_tasks, metric_map):
155
+ col_order = ["model"] + cluster_tasks
156
+ for t in cluster_tasks:
157
+ if t not in df.columns:
158
+ df[t] = '---'
159
+ df = df[col_order]
160
+ for t in cluster_tasks:
161
+ df[t] = df[t].apply(lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x)
162
+ df["Cluster Score"] = df[cluster_tasks].apply(
163
+ lambda row: cluster_average(row, cluster_tasks), axis=1
164
+ )
165
+ df["Cluster Score"] = df["Cluster Score"].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
166
+ df = df[["model", "Cluster Score"] + cluster_tasks]
167
+ # rename = {t: f"{t}\n{metric_map.get(t, '')}" for t in cluster_tasks}
168
+ rename = {t: f"{TASKS_LIST[t]}<br>Metric: {metrics_list[metric_map.get(t, '')]}" for t in cluster_tasks}
169
+ df = df.rename(columns=rename)
170
+ df = add_medals_to_models(df, score_col="Cluster Score")
171
+ return df
172
+
173
+ def format_main_overall_table(df, metric_map):
174
+ main = df.copy()
175
+ for cname, tasks in CLUSTERS.items():
176
+ main[cname] = main[tasks].apply(lambda row: cluster_average(row, tasks), axis=1)
177
+ cluster_cols = list(CLUSTERS.keys())
178
+ main["Overall Score"] = main[cluster_cols].apply(
179
+ lambda row: np.nanmean([x for x in row if pd.notna(x)]), axis=1
180
+ )
181
+ for c in cluster_cols + ["Overall Score"]:
182
+ main[c] = main[c].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
183
+ main = main[["model", "Overall Score"] + cluster_cols]
184
+ main = add_medals_to_models(main, score_col="Overall Score")
185
+ main.rename(columns={'Overall Score': 'Sahara Score'}, inplace=True)
186
+ return main
187
+
188
def load_leaderboards():
    """Load the results dataset and build every leaderboard table.

    Returns:
        (cluster_tabs, main_overall_tab, df, metric_map) where
        ``cluster_tabs`` maps cluster name -> formatted DataFrame,
        ``main_overall_tab`` is the main leaderboard DataFrame,
        ``df`` is the raw results frame, and ``metric_map`` maps
        task -> metric key.
    """
    df = load_private_leaderboard_df()
    metric_map = get_task_metric_map(df)
    main_df = df[df['leaderboard'] == 'main'].copy()
    if main_df.empty:
        cluster_tabs = {c: pd.DataFrame([{"Info": "No data"}]) for c in CLUSTERS}
        main_overall_tab = pd.DataFrame([{"Info": "No data"}])
        # Bug fix: this branch used to return a 6-tuple while the normal
        # path (and the module-level 4-name unpacking) expects 4 values,
        # which raised ValueError whenever the dataset was empty.
        return cluster_tabs, main_overall_tab, df, metric_map
    # Wide frame: one row per model, one column per task.
    main_tasks_df = main_df.pivot_table(index='model', columns='task', values='score').reset_index()
    cluster_tabs = {}
    for cname, tasks in CLUSTERS.items():
        cluster_tabs[cname] = format_cluster_table(main_tasks_df, tasks, metric_map)
    # The overall table needs every task column present, even if unscored.
    for t in ALL_TASKS:
        if t not in main_tasks_df.columns:
            main_tasks_df[t] = np.nan
    main_overall_tab = format_main_overall_table(main_tasks_df, metric_map)
    return cluster_tabs, main_overall_tab, df, metric_map
206
+
207
+ def df_to_html(df, col_minwidth=90, col_maxwidth=140, model_col_width=400):
208
+ # Remove any column whose name contains "task"
209
+ drop_cols = [col for col in df.columns if "task" in col]
210
+ df = df.drop(columns=drop_cols, errors="ignore")
211
+ df.columns.name = None
212
+ html=""
213
+ # html = f"""
214
+ # <style>
215
+ # .gradio-container-5-34-1 .prose table {{
216
+ # border-top: 2px solid #dca02a;
217
+ # border-bottom: 2px solid #dca02a;
218
+ # margin-bottom:20px;
219
+ # margin-left: auto;
220
+ # margin-right: auto;
221
+ # width: 100%;
222
+ # border-collapse: collapse;
223
+ # table-layout: fixed;
224
+ # }}
225
+ # .gradio-container-5-34-1 .prose thead tr {{
226
+ # background: #fffbe9;
227
+ # border-bottom: 2px solid #dca02a;
228
+ # }}
229
+ # .gradio-container-5-34-1 .prose th {{
230
+ # color: #7d3561;
231
+ # font-weight: bold;
232
+ # font-size: 20px;
233
+ # background: #fffbe9;
234
+ # padding: 8px 5px;
235
+ # vertical-align: middle;
236
+ # border: 0px solid #e0e0e0;
237
+ # }}
238
+ # td {{
239
+ # font-size: 18px;
240
+ # padding: 8px 5px;
241
+ # border: 0px solid #e0e0e0;
242
+ # vertical-align: middle;
243
+ # }}
244
+ # th:first-child, td:first-child {{
245
+ # min-width: {model_col_width}px !important;
246
+ # max-width: {model_col_width}px !important;
247
+ # width: {model_col_width}px !important;
248
+ # text-align: left !important;
249
+ # }}
250
+ # th:not(:first-child), td:not(:first-child) {{
251
+ # min-width: {col_minwidth}px;
252
+ # max-width: {col_maxwidth}px;
253
+ # width: auto;
254
+ # text-align: center;
255
+ # }}
256
+ # </style>
257
+ # """
258
+ html += df.to_html(index=False, escape=False)
259
+ return html
260
+
261
+
262
+
263
+ cluster_tabs, main_overall_tab, all_df, metric_map = load_leaderboards()
264
+
265
+ def get_lang_table(lang_name):
266
+ iso_codes = LANGNAME2ISOS.get(lang_name, [])
267
+ if not iso_codes:
268
+ return pd.DataFrame([{"Info": "No data for this language"}])
269
+ # Find all leaderboards containing any ISO in this language group
270
+ pattern = re.compile(r"(^|-)(" + "|".join(re.escape(iso) for iso in iso_codes) + r")(-|$)")
271
+ matched_langs = [lb for lb in all_df['leaderboard'].unique() if lb not in ['main'] and pattern.search(lb)]
272
+ lang_df = all_df[all_df['leaderboard'].isin(matched_langs)].copy()
273
+ if lang_df.empty:
274
+ return pd.DataFrame([{"Info": "No data for this language"}])
275
+ def make_task_col(row):
276
+ lb = row['leaderboard']
277
+ task = row['task']
278
+ metric = row['metric']
279
+ if '-' in lb:
280
+ pair_lang = lb.split('-')
281
+ pair = lb.replace('-', '_')
282
+ # return f"{TASKS_LIST[task]}({task}) {LANG_ISO2NAME[pair_lang[0]]} to {LANG_ISO2NAME[pair_lang[1]]} ({pair})\n{metric}"
283
+ return f"{TASKS_LIST[task]} <br> {LANG_ISO2NAME[pair_lang[0]]} to {LANG_ISO2NAME[pair_lang[1]]} <br> Metric: {metrics_list[metric]}"
284
+ else:
285
+ return f"{TASKS_LIST[task]} <br> Metric: {metrics_list[metric]}"
286
+ lang_df['task_col'] = lang_df.apply(make_task_col, axis=1)
287
+ table = lang_df.pivot_table(index='model', columns='task_col', values='score').reset_index()
288
+ score_cols = [col for col in table.columns if col != 'model']
289
+ for col in score_cols:
290
+ table[col] = table[col].apply(lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x)
291
+ def avg_score(row):
292
+ vals = []
293
+ for col in score_cols:
294
+ try:
295
+ v = float(row[col])
296
+ vals.append(v)
297
+ except Exception:
298
+ continue
299
+ return np.mean(vals) if vals else np.nan
300
+ table.insert(1, 'Language Score', table.apply(avg_score, axis=1).apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---"))
301
+ table['__overall_score_float'] = table['Language Score'].apply(lambda x: float(x) if x != "---" else np.nan)
302
+ table = table.sort_values(by='__overall_score_float', ascending=False, kind="mergesort").reset_index(drop=True)
303
+ def get_rank_symbols(scores):
304
+ unique_scores = sorted(set([s for s in scores if not pd.isna(s)]), reverse=True)
305
+ symbols = ["πŸ†", "πŸ₯ˆ", "πŸ₯‰"]
306
+ score_to_symbol = {s: symbols[i] for i, s in enumerate(unique_scores[:3])}
307
+ return [score_to_symbol.get(s, "") for s in scores]
308
+ table['rank_symbol'] = get_rank_symbols(table['__overall_score_float'].tolist())
309
+ table['model'] = table['rank_symbol'] + ' ' + table['model']
310
+ table = table.drop(columns=['rank_symbol', '__overall_score_float'])
311
+ return table
312
+