Spaces:
Running
Running
Upload 6 files
Browse files- app.py +230 -0
- envs.py +12 -0
- helper.py +312 -0
- requirements.txt +16 -0
- src/envs.py +12 -0
- src/helper.py +312 -0
app.py
ADDED
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import gradio as gr
|
3 |
+
|
4 |
+
from src.helper import *
|
5 |
+
# Custom CSS to replicate the Google-style card design from the image
|
6 |
+
custom_head_html = """
|
7 |
+
<link rel="stylesheet" href="https://africa.dlnlp.ai/sahara/font-awesome/css/font-awesome.min.css">
|
8 |
+
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css">
|
9 |
+
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.2.0/jquery.min.js"></script>
|
10 |
+
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js"></script>
|
11 |
+
<link rel="stylesheet" type="text/css" href="./public/css/style.min.css">
|
12 |
+
<script defer src="./public/js/script.js"></script>
|
13 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
14 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
15 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&family=Rubik:wght@400;600&display=swap" rel="stylesheet">
|
16 |
+
"""
|
17 |
+
|
18 |
+
# Header banner shown at the top of the leaderboard page.
# Fix: removed a stray unmatched `</p>` closing tag (there was no opening <p>).
new_header_html = """
<center>
<br><br><br>
<img src="https://africa.dlnlp.ai/sahara/img/sahara_web_main.jpg" alt="Sahara logo" width="60%">
</center>
<br style="height:1px;">
"""
|
26 |
+
|
27 |
+
google_style_css = """
|
28 |
+
div.gradio-container-5-34-1{
|
29 |
+
background:#FFFBF5 !important;
|
30 |
+
}
|
31 |
+
|
32 |
+
div.svelte-1nguped {
|
33 |
+
background: white !important;
|
34 |
+
}
|
35 |
+
/* Main Content Area */
|
36 |
+
.content-section {
|
37 |
+
padding: 60px 0;
|
38 |
+
}
|
39 |
+
.content-card {
|
40 |
+
background-color: #fff;
|
41 |
+
border-radius: 12px;
|
42 |
+
box-shadow: 0 10px 15px -3px rgba(0,0,0,0.1), 0 4px 6px -2px rgba(0,0,0,0.05);
|
43 |
+
padding: 40px;
|
44 |
+
margin-bottom: 40px;
|
45 |
+
}
|
46 |
+
.btn-cite {
|
47 |
+
color: #7d3561;
|
48 |
+
font-size: 16px;
|
49 |
+
margin: 0 3px; /* Add spacing between multiple icons */
|
50 |
+
}
|
51 |
+
.content-card h4 {
|
52 |
+
font-family: "Rubik", sans-serif;
|
53 |
+
color: #7d3561;
|
54 |
+
}
|
55 |
+
.content-card h2 {
|
56 |
+
font-family: "Rubik", sans-serif;
|
57 |
+
font-size: 30px;
|
58 |
+
font-weight: 600;
|
59 |
+
line-height: 1.25;
|
60 |
+
letter-spacing: -1px;
|
61 |
+
color: #2f3b7d;
|
62 |
+
text-transform:none;
|
63 |
+
|
64 |
+
/* font-size: 30px;
|
65 |
+
font-weight: bold;
|
66 |
+
color: #D97706; /* Brand Orange */
|
67 |
+
margin-top: 0;
|
68 |
+
margin-bottom: 20px; */
|
69 |
+
}
|
70 |
+
.content-card p {
|
71 |
+
/* font-size: 18px; */
|
72 |
+
/* line-height: 1.7; */
|
73 |
+
}
|
74 |
+
|
75 |
+
div.svelte-wv8on1{
|
76 |
+
# border: 2px solid #074e4a !important;
|
77 |
+
border-top: 0 !important;
|
78 |
+
/* background-color: #fff2eb !important; */
|
79 |
+
padding: 10px !important;
|
80 |
+
}
|
81 |
+
.padding.svelte-phx28p {
|
82 |
+
padding:0 !important;
|
83 |
+
}
|
84 |
+
|
85 |
+
.tab-wrapper.svelte-1tcem6n.svelte-1tcem6n {
|
86 |
+
display: flex;
|
87 |
+
align-items: center;
|
88 |
+
justify-content: space-between;
|
89 |
+
position: relative;
|
90 |
+
height: 0 !important;
|
91 |
+
padding-bottom: 0 !important;
|
92 |
+
}
|
93 |
+
|
94 |
+
|
95 |
+
.selected.svelte-1tcem6n.svelte-1tcem6n {
|
96 |
+
background-color: #7d3561 !important;
|
97 |
+
color: #fff !important;
|
98 |
+
}
|
99 |
+
.tabs.svelte-1tcem6n.svelte-1tcem6n {
|
100 |
+
/* border: 1px solid #dca02a !important; */
|
101 |
+
border-top: 0 !important;
|
102 |
+
/* background-color: #dca02a !important; */
|
103 |
+
}
|
104 |
+
button.svelte-1tcem6n.svelte-1tcem6n {
|
105 |
+
color: #7d3561 !important;
|
106 |
+
/* border: 1px solid #dca02a !important; */
|
107 |
+
font-weight: bold;
|
108 |
+
/* font-size: 16px; */
|
109 |
+
padding: 8px 5px;
|
110 |
+
}
|
111 |
+
.tab-container.svelte-1tcem6n.svelte-1tcem6n:after {
|
112 |
+
content: "";
|
113 |
+
position: absolute;
|
114 |
+
bottom: 0;
|
115 |
+
left: 0;
|
116 |
+
right: 0;
|
117 |
+
height: 2px;
|
118 |
+
background-color: #7d3561 !important;
|
119 |
+
}
|
120 |
+
|
121 |
+
.gradio-container-5-34-1 .prose table,
|
122 |
+
.gradio-container-5-34-1 .prose tr,
|
123 |
+
.gradio-container-5-34-1 .prose td,
|
124 |
+
.gradio-container-5-34-1 .prose th {
|
125 |
+
border: 0 !important;
|
126 |
+
border-top: 2px solid #dca02a;
|
127 |
+
border-bottom: 2px solid #dca02a;
|
128 |
+
}
|
129 |
+
|
130 |
+
|
131 |
+
.gradio-container-5-34-1 .prose table {
|
132 |
+
border-top: 2px solid #dca02a !important;
|
133 |
+
border-bottom: 2px solid #dca02a !important;
|
134 |
+
margin-bottom:20px;
|
135 |
+
margin-left: auto;
|
136 |
+
margin-right: auto;
|
137 |
+
width: 100%;
|
138 |
+
border-collapse: collapse;
|
139 |
+
table-layout: fixed;
|
140 |
+
}
|
141 |
+
.gradio-container-5-34-1 .prose thead tr {
|
142 |
+
border-bottom: 2px solid #dca02a !important;
|
143 |
+
}
|
144 |
+
.gradio-container-5-34-1 .prose th {
|
145 |
+
color: #7d3561;
|
146 |
+
font-weight: bold;
|
147 |
+
/* font-size: 20px; */
|
148 |
+
padding: 8px 5px;
|
149 |
+
vertical-align: middle;
|
150 |
+
border: 0 !important;
|
151 |
+
}
|
152 |
+
.gradio-container-5-34-1 .prose td {
|
153 |
+
/* font-size: 18px; */
|
154 |
+
padding: 8px 5px;
|
155 |
+
border: 0 !important;
|
156 |
+
vertical-align: middle;
|
157 |
+
}
|
158 |
+
.gradio-container-5-34-1 .prose th:first-child,
|
159 |
+
.gradio-container-5-34-1 .prose td:first-child {
|
160 |
+
min-width: 400px !important;
|
161 |
+
max-width: 400px !important;
|
162 |
+
width:400px !important;
|
163 |
+
text-align: left !important;
|
164 |
+
}
|
165 |
+
.gradio-container-5-34-1 .prose th:not(:first-child),
|
166 |
+
.gradio-container-5-34-1 .prose td:not(:first-child) {
|
167 |
+
min-width: 90px;
|
168 |
+
max-width: 140px;
|
169 |
+
width: auto;
|
170 |
+
text-align: center;
|
171 |
+
}
|
172 |
+
"""
|
173 |
+
|
174 |
+
introduction_text = """
|
175 |
+
|
176 |
+
"""
|
177 |
+
# Page layout: header banner, then the three leaderboard views as tabs inside
# one styled card. Fixes: user-facing typos ("Select Task-CLuster",
# "No cluser found"), misspelled local helper (`get_claster_table`), and a
# linear scan over `cluster_tabs` replaced by a direct dict lookup.
with gr.Blocks(css=google_style_css) as demo:
    gr.HTML(new_header_html)
    with gr.Group(elem_classes="content-card"):
        gr.Markdown("<br>")
        with gr.Tabs():
            # Overall (cross-task) leaderboard.
            with gr.Tab("Main Leaderboard"):
                gr.HTML("<br><br><center><h2>Main Leaderboard</h2></center><br>")
                gr.HTML(df_to_html(main_overall_tab))

            # One leaderboard per task cluster, selected via dropdown.
            with gr.Tab("Task-Clusters Leaderboards"):
                gr.HTML("<br><br><center><h2>Task-Clusters Leaderboards</h2></center><br>")
                cluster_names = list(cluster_tabs.keys())
                clusters_dropdown = gr.Dropdown(
                    choices=cluster_names, label="Select Task-Cluster", interactive=True
                )

                def get_cluster_table(cluster_name):
                    # `cluster_tabs` maps cluster name -> DataFrame.
                    return cluster_tabs.get(cluster_name)

                cluster_table_component = gr.HTML(
                    df_to_html(get_cluster_table(cluster_names[0]))
                    if cluster_names
                    else "<b>No cluster found</b>"
                )

                def update_cluster_table(cluster_name):
                    return df_to_html(get_cluster_table(cluster_name))

                clusters_dropdown.change(
                    update_cluster_table, clusters_dropdown, cluster_table_component
                )

            # One leaderboard per African language, selected via dropdown.
            with gr.Tab("Language-Specific Leaderboards"):
                gr.HTML("<br><br><center><h2>Language-Specific Leaderboards</h2></center><br>")
                lang_dropdown = gr.Dropdown(
                    choices=LANG_NAME_LIST, label="Select Language", interactive=True
                )
                lang_table_component = gr.HTML(
                    df_to_html(get_lang_table(LANG_NAME_LIST[0]))
                    if LANG_NAME_LIST
                    else "<b>No languages found</b>"
                )

                def update_lang_table(lang_name):
                    return df_to_html(get_lang_table(lang_name))

                lang_dropdown.change(update_lang_table, lang_dropdown, lang_table_component)

if __name__ == "__main__":
    demo.launch(share=True)
|
envs.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Environment-driven configuration for the Sahara leaderboard Space."""

import os

from huggingface_hub import HfApi

# Hugging Face access token (configured as a Space secret).
HF_TOKEN = os.environ.get("HF_TOKEN")
# Dataset repository id holding the leaderboard data.
SAHARA_DATA = os.environ.get("SAHARA_DATA")
# Data file (within that repository) containing the results.
SAHARA_RESULTS = os.environ.get("SAHARA_RESULTS")

# Authenticated Hub client used by the rest of the app.
API = HfApi(token=HF_TOKEN)
|
helper.py
ADDED
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from statistics import mean
|
3 |
+
import pandas as pd
|
4 |
+
import json
|
5 |
+
import numpy as np
|
6 |
+
from statistics import mean
|
7 |
+
import re
|
8 |
+
from datasets import load_dataset
|
9 |
+
import os
|
10 |
+
from collections import defaultdict
|
11 |
+
from src.envs import API, SAHARA_DATA, SAHARA_RESULTS
|
12 |
+
TASKS_LIST={
|
13 |
+
'xlni':'Cross-Lingual Natural Language Inference',
|
14 |
+
'lid':'Language Identification',
|
15 |
+
'news': 'News Classification',
|
16 |
+
'sentiment':'Sentiment Analysis',
|
17 |
+
'topic':'Topic Classification',
|
18 |
+
'mt_eng2xx':'Machine Translation - English to African',
|
19 |
+
'mt_fra2xx':'Machine Translation - French to African',
|
20 |
+
'mt_xx2xx':'Machine Translation - African to African',
|
21 |
+
'paraphrase':'Paraphrase',
|
22 |
+
'summary':'Summarization',
|
23 |
+
'title':'Title Generation',
|
24 |
+
'mmlu':'General Knowledge',
|
25 |
+
'mgsm':'Mathematical Word Problems',
|
26 |
+
'belebele':'Reading Comprehension',
|
27 |
+
'squad_qa':'Context-based Question Answering',
|
28 |
+
'ner':'Named Entity Recognition',
|
29 |
+
'phrase':'Phrase Chunking',
|
30 |
+
'pos':'Part-of-Speech Tagging',
|
31 |
+
}
|
32 |
+
CLUSTERS = {
|
33 |
+
"Text Classification": [
|
34 |
+
'xlni', 'lid', 'news', 'sentiment', 'topic',
|
35 |
+
],
|
36 |
+
"Text Generation": [
|
37 |
+
'mt_eng2xx', 'mt_fra2xx', 'mt_xx2xx', 'paraphrase', 'summary', 'title',
|
38 |
+
],
|
39 |
+
"MCCR": [
|
40 |
+
'mmlu', 'mgsm', 'belebele', 'squad_qa',
|
41 |
+
],
|
42 |
+
"Tokens": [
|
43 |
+
'ner', 'phrase', 'pos',
|
44 |
+
],
|
45 |
+
}
|
46 |
+
ALL_TASKS = [t for cluster in CLUSTERS.values() for t in cluster]
|
47 |
+
|
48 |
+
# ===== Authenticate and Load Data From Private HF Repo =====
|
49 |
+
|
50 |
+
def load_private_leaderboard_df():
    """Fetch the Sahara results file from the HF Hub and return it as a DataFrame.

    `force_redownload` makes the leaderboard reflect the latest uploaded
    results instead of a locally cached copy.
    """
    dataset = load_dataset(
        path=SAHARA_DATA,
        name=None,
        data_files=SAHARA_RESULTS,
        split="train",
        download_mode="force_redownload",
    )
    return dataset.to_pandas()
|
59 |
+
metrics_list={
|
60 |
+
'bleu_1k':'spBleu<sup>1K</sup>',
|
61 |
+
'accuracy':'Accuracy',
|
62 |
+
'f1':'Macro-F1',
|
63 |
+
'exact_match':'Exact Match',
|
64 |
+
'rougeL':'RougeL',
|
65 |
+
}
|
66 |
+
LANG_ISO2NAME = {
|
67 |
+
'eng': 'English',
|
68 |
+
'fra': 'French',
|
69 |
+
# 'ara': 'Arabic',
|
70 |
+
'amh': 'Amharic',
|
71 |
+
'ewe': 'Ewe',
|
72 |
+
'hau': 'Hausa',
|
73 |
+
'ibo': 'Igbo',
|
74 |
+
'kin': 'Kinyarwanda',
|
75 |
+
'lin': 'Lingala',
|
76 |
+
'lug': 'Ganda',
|
77 |
+
'orm': 'Oromo',
|
78 |
+
'sna': 'Shona',
|
79 |
+
'sot': 'Southern Sotho',
|
80 |
+
'swa': 'Swahili', 'swh': 'Swahili',
|
81 |
+
'twi': 'Twi',
|
82 |
+
'wol': 'Wolof',
|
83 |
+
'xho': 'Xhosa',
|
84 |
+
'yor': 'Yoruba',
|
85 |
+
'zul': 'Zulu',
|
86 |
+
'afr': 'Afrikaans',
|
87 |
+
'run': 'Rundi',
|
88 |
+
'tir': 'Tigrinya',
|
89 |
+
'som': 'Somali',
|
90 |
+
'pcm': 'Nigerian Pidgin',
|
91 |
+
'teo': 'Teso',
|
92 |
+
'nyn': 'Nyankore/Nyankole',
|
93 |
+
'lgg': 'Lugbara',
|
94 |
+
'bem': 'Bemba/Chibemba',
|
95 |
+
'tsn': 'Tswana',
|
96 |
+
'bbj': 'GhomΓ‘lΓ‘',
|
97 |
+
'mos': 'Moore',
|
98 |
+
'bam': 'Bambara',
|
99 |
+
'fon': 'Fon',
|
100 |
+
'ach': 'Acholi',
|
101 |
+
'nso': 'Sepedi',
|
102 |
+
'tso': 'Tsonga',
|
103 |
+
'fuv': 'Fulfude Nigeria',
|
104 |
+
'gaz': 'Oromo, West Central',
|
105 |
+
'kea': 'Kabuverdianu',
|
106 |
+
'nya': 'Nyanja',
|
107 |
+
'ssw': 'Swati',
|
108 |
+
'luo': 'Dholuo/Luo',
|
109 |
+
'ven': 'Venda',
|
110 |
+
'kir':"Kirundi",
|
111 |
+
}
|
112 |
+
|
113 |
+
# ===== Build Language NameβISOs map =====
|
114 |
+
def build_langname_to_isos(iso2name):
    """Invert an ISO-code -> language-name map into name -> set of ISO codes.

    Several ISO codes can share one display name (e.g. 'swa' and 'swh' are
    both 'Swahili'), so each name maps to the set of all its codes.
    """
    inverted = defaultdict(set)
    for code, language in iso2name.items():
        inverted[language].add(code)
    return inverted
|
119 |
+
|
120 |
+
LANGNAME2ISOS = build_langname_to_isos(LANG_ISO2NAME)
|
121 |
+
#show only African langs
|
122 |
+
LANG_NAME_LIST = sorted([lang for lang in LANGNAME2ISOS.keys() if lang not in ['eng', 'fra', 'English', 'French']])
|
123 |
+
|
124 |
+
def get_task_metric_map(df):
    """Map each task name to its metric from the results dataframe.

    When a task appears in several rows, the last row wins — the same
    overwrite semantics as iterating the frame row by row.
    """
    return dict(zip(df["task"], df["metric"]))
|
129 |
+
|
130 |
+
def cluster_average(row, tasks):
    """Average the scores in `row` over `tasks`, skipping anything non-numeric.

    Entries that are missing or cannot be parsed as float (e.g. the "---"
    placeholder) are ignored; returns NaN when no task score is usable.
    """
    scores = []
    for task in tasks:
        try:
            scores.append(float(row[task]))
        except Exception:
            pass
    return np.mean(scores) if scores else np.nan
|
139 |
+
|
140 |
+
def add_medals_to_models(df, score_col="overall score"):
    """Sort `df` by `score_col` descending and medal the top-3 distinct scores.

    `score_col` holds pre-formatted strings ("12.34") or the "---" placeholder.
    Ties share a medal (ranking is by distinct score value). The input frame
    is left untouched (the original leaked a `__score_float` column onto the
    caller's frame); a sorted copy is returned.

    NOTE(review): the source's medal literals were mojibake; restored as
    trophy/silver/bronze emoji — confirm against the deployed UI.
    """
    df = df.copy()  # fix: don't mutate the caller's frame
    helper_col = "__score_float"
    df[helper_col] = df[score_col].apply(lambda x: float(x) if x != "---" else np.nan)
    # Stable sort keeps the incoming order for equal scores; NaNs sort last.
    df = df.sort_values(by=helper_col, ascending=False, kind="mergesort").reset_index(drop=True)

    def get_rank_symbols(scores):
        unique_scores = sorted({s for s in scores if not pd.isna(s)}, reverse=True)
        symbols = ["🏆", "🥈", "🥉"]
        score_to_symbol = {s: symbols[i] for i, s in enumerate(unique_scores[:3])}
        return [score_to_symbol.get(s, "") for s in scores]

    df["rank_symbol"] = get_rank_symbols(df[helper_col].tolist())
    df["model"] = df["rank_symbol"] + " " + df["model"]
    return df.drop(columns=["rank_symbol", helper_col])
|
153 |
+
|
154 |
+
def format_cluster_table(df, cluster_tasks, metric_map):
    """Build a per-cluster leaderboard: model, cluster average, per-task scores.

    Missing tasks are filled with "---"; numeric scores are formatted to two
    decimals; headers become "<task name><br>Metric: <metric name>".
    Fix: operates on a copy — the original added "---" columns to the caller's
    shared pivot table as a side effect.
    """
    df = df.copy()  # fix: don't mutate the caller's shared pivot table
    for task in cluster_tasks:
        if task not in df.columns:
            df[task] = '---'
    df = df[["model"] + cluster_tasks]
    for task in cluster_tasks:
        df[task] = df[task].apply(
            lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x
        )
    # Cluster score: mean of the task scores that parse as numbers.
    df["Cluster Score"] = df[cluster_tasks].apply(
        lambda row: cluster_average(row, cluster_tasks), axis=1
    )
    df["Cluster Score"] = df["Cluster Score"].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
    df = df[["model", "Cluster Score"] + cluster_tasks]
    rename = {
        t: f"{TASKS_LIST[t]}<br>Metric: {metrics_list[metric_map.get(t, '')]}"
        for t in cluster_tasks
    }
    df = df.rename(columns=rename)
    return add_medals_to_models(df, score_col="Cluster Score")
|
172 |
+
|
173 |
+
def format_main_overall_table(df, metric_map):
    """Build the main leaderboard: per-cluster averages plus a "Sahara Score".

    The Sahara Score is the mean of the available cluster averages.
    `metric_map` is accepted for interface parity with format_cluster_table
    but is not used here.
    Fix: guard the all-NaN row case explicitly instead of feeding an empty
    list to np.nanmean (which emits a RuntimeWarning and returns NaN anyway).
    """
    main = df.copy()
    for cname, tasks in CLUSTERS.items():
        main[cname] = main[tasks].apply(lambda row, ts=tasks: cluster_average(row, ts), axis=1)
    cluster_cols = list(CLUSTERS.keys())

    def overall(row):
        vals = [x for x in row if pd.notna(x)]
        return np.mean(vals) if vals else np.nan

    main["Overall Score"] = main[cluster_cols].apply(overall, axis=1)
    for c in cluster_cols + ["Overall Score"]:
        main[c] = main[c].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---")
    main = main[["model", "Overall Score"] + cluster_cols]
    main = add_medals_to_models(main, score_col="Overall Score")
    main.rename(columns={'Overall Score': 'Sahara Score'}, inplace=True)
    return main
|
187 |
+
|
188 |
+
def load_leaderboards():
    """Load the results dataset and build every leaderboard table.

    Returns (cluster_tabs, main_overall_tab, df, metric_map).
    Fix: the empty-data path previously returned a 6-tuple
    (cluster_tabs, main_overall_tab, [], {}, df, metric_map) while the normal
    path and the module-level caller unpack 4 values — an empty dataset would
    crash with ValueError at import time. Both paths now return the same
    4-tuple; the unused `all_langs` computation was removed.
    """
    df = load_private_leaderboard_df()
    metric_map = get_task_metric_map(df)
    main_df = df[df['leaderboard'] == 'main'].copy()
    if main_df.empty:
        # Placeholder tables so the UI still renders with an empty dataset.
        cluster_tabs = {c: pd.DataFrame([{"Info": "No data"}]) for c in CLUSTERS}
        main_overall_tab = pd.DataFrame([{"Info": "No data"}])
        return cluster_tabs, main_overall_tab, df, metric_map
    main_tasks_df = main_df.pivot_table(index='model', columns='task', values='score').reset_index()
    cluster_tabs = {
        cname: format_cluster_table(main_tasks_df, tasks, metric_map)
        for cname, tasks in CLUSTERS.items()
    }
    # Make sure every known task column exists before building the main table.
    for t in ALL_TASKS:
        if t not in main_tasks_df.columns:
            main_tasks_df[t] = np.nan
    main_overall_tab = format_main_overall_table(main_tasks_df, metric_map)
    return cluster_tabs, main_overall_tab, df, metric_map
|
206 |
+
|
207 |
+
def df_to_html(df, col_minwidth=90, col_maxwidth=140, model_col_width=400):
    """Render a leaderboard dataframe as an HTML table.

    Columns whose (string) name contains "task" are dropped — they are
    internal helper columns. Table styling comes from the page-level CSS, so
    the width parameters are kept only for backward compatibility.
    Fixes: removed ~45 lines of dead commented-out inline CSS; the "task"
    check now skips non-string column names instead of raising TypeError.
    """
    drop_cols = [col for col in df.columns if isinstance(col, str) and "task" in col]
    df = df.drop(columns=drop_cols, errors="ignore")
    df.columns.name = None  # suppress the pivot's columns-axis name in the header
    # escape=False keeps the <br> markup in column headers renderable.
    return df.to_html(index=False, escape=False)
|
260 |
+
|
261 |
+
|
262 |
+
|
263 |
+
cluster_tabs, main_overall_tab, all_df, metric_map = load_leaderboards()
|
264 |
+
|
265 |
+
def get_lang_table(lang_name):
    """Build the per-language leaderboard for `lang_name`.

    Gathers every leaderboard whose id contains one of the language's ISO
    codes (standalone like "swa" or as one side of a translation pair like
    "eng-swa"), pivots to model x task, and prepends an averaged
    "Language Score" column with medal ranking.
    Fixes: the ranking/medal logic duplicated add_medals_to_models verbatim —
    it now calls that helper; removed the unused `pair` local.
    """
    iso_codes = LANGNAME2ISOS.get(lang_name, [])
    if not iso_codes:
        return pd.DataFrame([{"Info": "No data for this language"}])
    # Match an ISO code either standalone or on either side of "src-tgt".
    pattern = re.compile(r"(^|-)(" + "|".join(re.escape(iso) for iso in iso_codes) + r")(-|$)")
    matched_langs = [
        lb for lb in all_df['leaderboard'].unique() if lb not in ['main'] and pattern.search(lb)
    ]
    lang_df = all_df[all_df['leaderboard'].isin(matched_langs)].copy()
    if lang_df.empty:
        return pd.DataFrame([{"Info": "No data for this language"}])

    def make_task_col(row):
        # Human-readable column header: task, optional language pair, metric.
        lb, task, metric = row['leaderboard'], row['task'], row['metric']
        if '-' in lb:
            pair_lang = lb.split('-')
            return (
                f"{TASKS_LIST[task]} <br> "
                f"{LANG_ISO2NAME[pair_lang[0]]} to {LANG_ISO2NAME[pair_lang[1]]} <br> "
                f"Metric: {metrics_list[metric]}"
            )
        return f"{TASKS_LIST[task]} <br> Metric: {metrics_list[metric]}"

    lang_df['task_col'] = lang_df.apply(make_task_col, axis=1)
    table = lang_df.pivot_table(index='model', columns='task_col', values='score').reset_index()
    score_cols = [col for col in table.columns if col != 'model']
    for col in score_cols:
        table[col] = table[col].apply(
            lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x
        )

    def avg_score(row):
        vals = []
        for col in score_cols:
            try:
                vals.append(float(row[col]))
            except Exception:
                continue
        return np.mean(vals) if vals else np.nan

    table.insert(
        1,
        'Language Score',
        table.apply(avg_score, axis=1).apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---"),
    )
    # Same sort + medal logic as every other leaderboard table.
    return add_medals_to_models(table, score_col='Language Score')
|
312 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
APScheduler
|
2 |
+
black
|
3 |
+
datasets
|
4 |
+
gradio
|
5 |
+
gradio[oauth]
|
6 |
+
gradio_leaderboard==0.0.13
|
7 |
+
gradio_client
|
8 |
+
huggingface-hub>=0.18.0
|
9 |
+
matplotlib
|
10 |
+
numpy
|
11 |
+
pandas
|
12 |
+
python-dateutil
|
13 |
+
tqdm
|
14 |
+
transformers
|
15 |
+
tokenizers>=0.15.0
|
16 |
+
sentencepiece
|
src/envs.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Environment-driven configuration for the Sahara leaderboard Space."""

import os

from huggingface_hub import HfApi

# Hugging Face access token (configured as a Space secret).
HF_TOKEN = os.environ.get("HF_TOKEN")
# Dataset repository id holding the leaderboard data.
SAHARA_DATA = os.environ.get("SAHARA_DATA")
# Data file (within that repository) containing the results.
SAHARA_RESULTS = os.environ.get("SAHARA_RESULTS")

# Authenticated Hub client used by the rest of the app.
API = HfApi(token=HF_TOKEN)
|
src/helper.py
ADDED
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from statistics import mean
|
3 |
+
import pandas as pd
|
4 |
+
import json
|
5 |
+
import numpy as np
|
6 |
+
from statistics import mean
|
7 |
+
import re
|
8 |
+
from datasets import load_dataset
|
9 |
+
import os
|
10 |
+
from collections import defaultdict
|
11 |
+
from src.envs import API, SAHARA_DATA, SAHARA_RESULTS
|
12 |
+
TASKS_LIST={
|
13 |
+
'xlni':'Cross-Lingual Natural Language Inference',
|
14 |
+
'lid':'Language Identification',
|
15 |
+
'news': 'News Classification',
|
16 |
+
'sentiment':'Sentiment Analysis',
|
17 |
+
'topic':'Topic Classification',
|
18 |
+
'mt_eng2xx':'Machine Translation - English to African',
|
19 |
+
'mt_fra2xx':'Machine Translation - French to African',
|
20 |
+
'mt_xx2xx':'Machine Translation - African to African',
|
21 |
+
'paraphrase':'Paraphrase',
|
22 |
+
'summary':'Summarization',
|
23 |
+
'title':'Title Generation',
|
24 |
+
'mmlu':'General Knowledge',
|
25 |
+
'mgsm':'Mathematical Word Problems',
|
26 |
+
'belebele':'Reading Comprehension',
|
27 |
+
'squad_qa':'Context-based Question Answering',
|
28 |
+
'ner':'Named Entity Recognition',
|
29 |
+
'phrase':'Phrase Chunking',
|
30 |
+
'pos':'Part-of-Speech Tagging',
|
31 |
+
}
|
32 |
+
CLUSTERS = {
|
33 |
+
"Text Classification": [
|
34 |
+
'xlni', 'lid', 'news', 'sentiment', 'topic',
|
35 |
+
],
|
36 |
+
"Text Generation": [
|
37 |
+
'mt_eng2xx', 'mt_fra2xx', 'mt_xx2xx', 'paraphrase', 'summary', 'title',
|
38 |
+
],
|
39 |
+
"MCCR": [
|
40 |
+
'mmlu', 'mgsm', 'belebele', 'squad_qa',
|
41 |
+
],
|
42 |
+
"Tokens": [
|
43 |
+
'ner', 'phrase', 'pos',
|
44 |
+
],
|
45 |
+
}
|
46 |
+
ALL_TASKS = [t for cluster in CLUSTERS.values() for t in cluster]
|
47 |
+
|
48 |
+
# ===== Authenticate and Load Data From Private HF Repo =====
|
49 |
+
|
50 |
+
def load_private_leaderboard_df():
|
51 |
+
ds = load_dataset(
|
52 |
+
path=SAHARA_DATA,
|
53 |
+
name=None,
|
54 |
+
data_files=SAHARA_RESULTS,
|
55 |
+
split="train",
|
56 |
+
download_mode="force_redownload"
|
57 |
+
)
|
58 |
+
return ds.to_pandas()
|
59 |
+
metrics_list={
|
60 |
+
'bleu_1k':'spBleu<sup>1K</sup>',
|
61 |
+
'accuracy':'Accuracy',
|
62 |
+
'f1':'Macro-F1',
|
63 |
+
'exact_match':'Exact Match',
|
64 |
+
'rougeL':'RougeL',
|
65 |
+
}
|
66 |
+
LANG_ISO2NAME = {
|
67 |
+
'eng': 'English',
|
68 |
+
'fra': 'French',
|
69 |
+
# 'ara': 'Arabic',
|
70 |
+
'amh': 'Amharic',
|
71 |
+
'ewe': 'Ewe',
|
72 |
+
'hau': 'Hausa',
|
73 |
+
'ibo': 'Igbo',
|
74 |
+
'kin': 'Kinyarwanda',
|
75 |
+
'lin': 'Lingala',
|
76 |
+
'lug': 'Ganda',
|
77 |
+
'orm': 'Oromo',
|
78 |
+
'sna': 'Shona',
|
79 |
+
'sot': 'Southern Sotho',
|
80 |
+
'swa': 'Swahili', 'swh': 'Swahili',
|
81 |
+
'twi': 'Twi',
|
82 |
+
'wol': 'Wolof',
|
83 |
+
'xho': 'Xhosa',
|
84 |
+
'yor': 'Yoruba',
|
85 |
+
'zul': 'Zulu',
|
86 |
+
'afr': 'Afrikaans',
|
87 |
+
'run': 'Rundi',
|
88 |
+
'tir': 'Tigrinya',
|
89 |
+
'som': 'Somali',
|
90 |
+
'pcm': 'Nigerian Pidgin',
|
91 |
+
'teo': 'Teso',
|
92 |
+
'nyn': 'Nyankore/Nyankole',
|
93 |
+
'lgg': 'Lugbara',
|
94 |
+
'bem': 'Bemba/Chibemba',
|
95 |
+
'tsn': 'Tswana',
|
96 |
+
'bbj': 'GhomΓ‘lΓ‘',
|
97 |
+
'mos': 'Moore',
|
98 |
+
'bam': 'Bambara',
|
99 |
+
'fon': 'Fon',
|
100 |
+
'ach': 'Acholi',
|
101 |
+
'nso': 'Sepedi',
|
102 |
+
'tso': 'Tsonga',
|
103 |
+
'fuv': 'Fulfude Nigeria',
|
104 |
+
'gaz': 'Oromo, West Central',
|
105 |
+
'kea': 'Kabuverdianu',
|
106 |
+
'nya': 'Nyanja',
|
107 |
+
'ssw': 'Swati',
|
108 |
+
'luo': 'Dholuo/Luo',
|
109 |
+
'ven': 'Venda',
|
110 |
+
'kir':"Kirundi",
|
111 |
+
}
|
112 |
+
|
113 |
+
# ===== Build Language NameβISOs map =====
|
114 |
+
def build_langname_to_isos(iso2name):
|
115 |
+
name2isos = defaultdict(set)
|
116 |
+
for iso, name in iso2name.items():
|
117 |
+
name2isos[name].add(iso)
|
118 |
+
return name2isos
|
119 |
+
|
120 |
+
LANGNAME2ISOS = build_langname_to_isos(LANG_ISO2NAME)
|
121 |
+
#show only African langs
|
122 |
+
LANG_NAME_LIST = sorted([lang for lang in LANGNAME2ISOS.keys() if lang not in ['eng', 'fra', 'English', 'French']])
|
123 |
+
|
124 |
+
def get_task_metric_map(df):
|
125 |
+
mapping = {}
|
126 |
+
for _, row in df.iterrows():
|
127 |
+
mapping[row["task"]] = row["metric"]
|
128 |
+
return mapping
|
129 |
+
|
130 |
+
def cluster_average(row, tasks):
|
131 |
+
vals = []
|
132 |
+
for t in tasks:
|
133 |
+
try:
|
134 |
+
v = float(row[t])
|
135 |
+
vals.append(v)
|
136 |
+
except Exception:
|
137 |
+
continue
|
138 |
+
return np.mean(vals) if vals else np.nan
|
139 |
+
|
140 |
+
def add_medals_to_models(df, score_col="overall score"):
    """Sort `df` by `score_col` descending and medal the top-3 distinct scores.

    `score_col` holds pre-formatted strings ("12.34") or the "---" placeholder.
    Ties share a medal (ranking is by distinct score value). The input frame
    is left untouched (the original leaked a `__score_float` column onto the
    caller's frame); a sorted copy is returned.

    NOTE(review): the source's medal literals were mojibake; restored as
    trophy/silver/bronze emoji — confirm against the deployed UI.
    """
    df = df.copy()  # fix: don't mutate the caller's frame
    helper_col = "__score_float"
    df[helper_col] = df[score_col].apply(lambda x: float(x) if x != "---" else np.nan)
    # Stable sort keeps the incoming order for equal scores; NaNs sort last.
    df = df.sort_values(by=helper_col, ascending=False, kind="mergesort").reset_index(drop=True)

    def get_rank_symbols(scores):
        unique_scores = sorted({s for s in scores if not pd.isna(s)}, reverse=True)
        symbols = ["🏆", "🥈", "🥉"]
        score_to_symbol = {s: symbols[i] for i, s in enumerate(unique_scores[:3])}
        return [score_to_symbol.get(s, "") for s in scores]

    df["rank_symbol"] = get_rank_symbols(df[helper_col].tolist())
    df["model"] = df["rank_symbol"] + " " + df["model"]
    return df.drop(columns=["rank_symbol", helper_col])
|
153 |
+
|
154 |
+
def format_cluster_table(df, cluster_tasks, metric_map):
    """Build one cluster's leaderboard table.

    *df* is a model x task pivot of scores, *cluster_tasks* the task columns
    belonging to this cluster, *metric_map* maps task -> metric key. Returns
    a frame with the model, a 'Cluster Score' average, and one display-named
    column per task (headers carry HTML rendered later by df_to_html).

    Fix: works on a copy — the previous version added '---' placeholder
    columns to the caller's frame in place and assigned through a slice
    (SettingWithCopy-prone).
    """
    df = df.copy()
    for t in cluster_tasks:
        if t not in df.columns:
            df[t] = '---'  # placeholder for tasks with no scores at all
    df = df[["model"] + cluster_tasks]
    for t in cluster_tasks:
        df[t] = df[t].apply(
            lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x
        )
    # NOTE(review): a pivot NaN is formatted as the string "nan", which
    # float() parses, so one missing task turns the whole cluster average
    # into NaN/'---'. Behavior kept as-is — confirm this is intended.
    df["Cluster Score"] = df[cluster_tasks].apply(
        lambda row: cluster_average(row, cluster_tasks), axis=1
    )
    df["Cluster Score"] = df["Cluster Score"].apply(
        lambda x: f"{x:.2f}" if pd.notna(x) else "---"
    )
    df = df[["model", "Cluster Score"] + cluster_tasks]
    rename = {t: f"{TASKS_LIST[t]}<br>Metric: {metrics_list[metric_map.get(t, '')]}"
              for t in cluster_tasks}
    df = df.rename(columns=rename)
    return add_medals_to_models(df, score_col="Cluster Score")
|
172 |
+
|
173 |
+
def format_main_overall_table(df, metric_map):
    """Build the main leaderboard: one averaged column per cluster plus an
    overall 'Sahara Score'.

    *metric_map* is accepted for signature parity with the other table
    builders but is not used here. Scores are formatted to two decimals;
    missing values render as '---'.
    """
    table = df.copy()
    cluster_names = list(CLUSTERS.keys())
    for name in cluster_names:
        tasks = CLUSTERS[name]
        table[name] = table[tasks].apply(
            lambda row, t=tasks: cluster_average(row, t), axis=1
        )
    table["Overall Score"] = table[cluster_names].apply(
        lambda row: np.nanmean([v for v in row if pd.notna(v)]), axis=1
    )
    for col in cluster_names + ["Overall Score"]:
        table[col] = table[col].apply(lambda v: f"{v:.2f}" if pd.notna(v) else "---")
    table = table[["model", "Overall Score"] + cluster_names]
    table = add_medals_to_models(table, score_col="Overall Score")
    return table.rename(columns={'Overall Score': 'Sahara Score'})
|
187 |
+
|
188 |
+
def load_leaderboards():
    """Load the private leaderboard frame and build all display tables.

    Returns a 4-tuple ``(cluster_tabs, main_overall_tab, df, metric_map)``:
    per-cluster tables keyed by cluster name, the main overall table, the
    raw long-format frame, and the task -> metric map.

    Fix: the empty-data branch previously returned SIX values
    (``cluster_tabs, main_overall_tab, [], {}, df, metric_map``) while the
    normal path and the module-level caller unpack FOUR — unpacking would
    crash whenever the 'main' leaderboard was empty. Both paths now return
    the same 4-tuple. Also removed the unused ``all_langs`` local.
    """
    df = load_private_leaderboard_df()
    metric_map = get_task_metric_map(df)
    main_df = df[df['leaderboard'] == 'main'].copy()
    if main_df.empty:
        cluster_tabs = {c: pd.DataFrame([{"Info": "No data"}]) for c in CLUSTERS}
        main_overall_tab = pd.DataFrame([{"Info": "No data"}])
        return cluster_tabs, main_overall_tab, df, metric_map
    main_tasks_df = main_df.pivot_table(index='model', columns='task',
                                        values='score').reset_index()
    cluster_tabs = {
        cname: format_cluster_table(main_tasks_df, tasks, metric_map)
        for cname, tasks in CLUSTERS.items()
    }
    # Ensure every known task column exists before computing overall averages.
    for t in ALL_TASKS:
        if t not in main_tasks_df.columns:
            main_tasks_df[t] = np.nan
    main_overall_tab = format_main_overall_table(main_tasks_df, metric_map)
    return cluster_tabs, main_overall_tab, df, metric_map
|
206 |
+
|
207 |
+
def df_to_html(df, col_minwidth=90, col_maxwidth=140, model_col_width=400):
    """Render *df* as an HTML table string for the Gradio UI.

    Columns whose name contains "task" are dropped (internal helper
    columns). Cells are not escaped, so columns may carry raw HTML (e.g.
    the "<br>"-formatted task headers). The width parameters are kept for
    backward compatibility with existing callers; styling now lives in the
    page CSS (the previously commented-out inline <style> block has been
    removed as dead code).
    """
    # str(col) guards against non-string column labels raising TypeError
    # on the `in` test (a pivot can yield non-string column names).
    drop_cols = [col for col in df.columns if "task" in str(col)]
    df = df.drop(columns=drop_cols, errors="ignore")
    df.columns.name = None  # suppress pandas' columns-axis label row
    return df.to_html(index=False, escape=False)
|
260 |
+
|
261 |
+
|
262 |
+
|
263 |
+
# Build every leaderboard view once at import time; `all_df` (the full
# long-format results frame) is read later by get_lang_table.
cluster_tabs, main_overall_tab, all_df, metric_map = load_leaderboards()
|
264 |
+
|
265 |
+
def get_lang_table(lang_name):
    """Build the per-language leaderboard table for *lang_name*.

    Resolves the display name to its ISO code(s), collects every
    leaderboard whose id contains one of those codes as a complete
    '-'-delimited segment, pivots model x task scores, inserts an averaged
    'Language Score' column, and medals/sorts the models.

    Consistency fix: the inline averaging and medal/sort logic duplicated
    ``cluster_average`` and ``add_medals_to_models``; both are now reused.
    Also removed the unused ``pair`` local.
    """
    iso_codes = LANGNAME2ISOS.get(lang_name, [])
    if not iso_codes:
        return pd.DataFrame([{"Info": "No data for this language"}])
    # Match an ISO code only as a full dash-delimited segment of the
    # leaderboard id (so 'swa' matches 'eng-swa' but not 'swahili-x').
    pattern = re.compile(r"(^|-)(" + "|".join(re.escape(iso) for iso in iso_codes) + r")(-|$)")
    matched_langs = [lb for lb in all_df['leaderboard'].unique()
                     if lb not in ['main'] and pattern.search(lb)]
    lang_df = all_df[all_df['leaderboard'].isin(matched_langs)].copy()
    if lang_df.empty:
        return pd.DataFrame([{"Info": "No data for this language"}])

    def make_task_col(row):
        # Human-readable column header: task display name, the translation
        # pair when the leaderboard id encodes one ('src-tgt'), and metric.
        lb = row['leaderboard']
        task = row['task']
        metric = row['metric']
        if '-' in lb:
            src_iso, tgt_iso = lb.split('-')[:2]
            return (f"{TASKS_LIST[task]} <br> {LANG_ISO2NAME[src_iso]} to "
                    f"{LANG_ISO2NAME[tgt_iso]} <br> Metric: {metrics_list[metric]}")
        return f"{TASKS_LIST[task]} <br> Metric: {metrics_list[metric]}"

    lang_df['task_col'] = lang_df.apply(make_task_col, axis=1)
    table = lang_df.pivot_table(index='model', columns='task_col',
                                values='score').reset_index()
    score_cols = [col for col in table.columns if col != 'model']
    for col in score_cols:
        table[col] = table[col].apply(
            lambda x: f"{x:.2f}" if isinstance(x, (int, float, np.integer, np.floating)) else x
        )
    # Per-model average over every task column that parses as a number.
    language_scores = table.apply(lambda row: cluster_average(row, score_cols), axis=1)
    table.insert(1, 'Language Score',
                 language_scores.apply(lambda x: f"{x:.2f}" if pd.notna(x) else "---"))
    return add_medals_to_models(table, score_col='Language Score')
|
312 |
+
|