Spaces:
Running
Running
Jude Khouja
committed on
Commit
·
6226c1b
1
Parent(s):
36ce9ab
Change description and change color of baseline scores
Browse files
- data_loader.py +2 -2
- tabs/leaderboard.py +5 -4
- utils.py +31 -0
data_loader.py
CHANGED
@@ -269,8 +269,8 @@ HEADER_CONTENT = (
|
|
269 |
</div>
|
270 |
|
271 |
<div class="description">
|
272 |
-
LingOly-TOO (L2) is a challenging reasoning benchmark designed to
|
273 |
-
|
274 |
<div class="highlight-question">
|
275 |
"How do top LLMs reason on unseen linguistic questions?"
|
276 |
</div>
|
|
|
269 |
</div>
|
270 |
|
271 |
<div class="description">
|
272 |
+
LingOly-TOO (L2) is a challenging linguistics reasoning benchmark designed to counteract answering without reasoning (e.g. by guessing or memorizing answers).
|
273 |
+
We permute <b>Ling</b>uistics <b>Oly</b>mpiad problems with <b>T</b>emplates and <b>O</b>rthographic <b>O</b>bfuscations. By rewriting (obfuscating) parts of questions and answers, the chance of benchmark leakage in training data is minimized.
|
274 |
<div class="highlight-question">
|
275 |
"How do top LLMs reason on unseen linguistic questions?"
|
276 |
</div>
|
tabs/leaderboard.py
CHANGED
@@ -3,13 +3,14 @@ from data_loader import METHODOLOGY
|
|
3 |
from utils import (
|
4 |
get_rank_badge,
|
5 |
get_score_bar,
|
|
|
6 |
get_type_badge,
|
7 |
)
|
8 |
|
9 |
def filter_leaderboard(df, sort_by):
|
10 |
filtered_df = df.copy()
|
11 |
|
12 |
-
if sort_by == "Score
|
13 |
filtered_df = filtered_df.sort_values(by="Obfuscated score", ascending=False)
|
14 |
else:
|
15 |
filtered_df = filtered_df.sort_values(by="Baseline score", ascending=False)
|
@@ -129,7 +130,7 @@ def filter_leaderboard(df, sort_by):
|
|
129 |
<td class="vendor-cell">{row['Provider']}</td>
|
130 |
<td>{get_type_badge(row['Type'])}</td>
|
131 |
<td class="score-cell">{get_score_bar(row['Obfuscated score'])}</td>
|
132 |
-
<td class="score-cell">{
|
133 |
</tr>
|
134 |
"""
|
135 |
|
@@ -143,8 +144,8 @@ def create_leaderboard_tab(df, HEADER_CONTENT, CARDS):
|
|
143 |
with gr.Row(equal_height=True):
|
144 |
with gr.Column(scale=0.4):
|
145 |
sort_by = gr.Dropdown(
|
146 |
-
choices=["Score
|
147 |
-
value="Score
|
148 |
label="Sort by",
|
149 |
)
|
150 |
|
|
|
3 |
from utils import (
|
4 |
get_rank_badge,
|
5 |
get_score_bar,
|
6 |
+
get_score_bar_secondary,
|
7 |
get_type_badge,
|
8 |
)
|
9 |
|
10 |
def filter_leaderboard(df, sort_by):
|
11 |
filtered_df = df.copy()
|
12 |
|
13 |
+
if sort_by == "Score on obfuscated questions":
|
14 |
filtered_df = filtered_df.sort_values(by="Obfuscated score", ascending=False)
|
15 |
else:
|
16 |
filtered_df = filtered_df.sort_values(by="Baseline score", ascending=False)
|
|
|
130 |
<td class="vendor-cell">{row['Provider']}</td>
|
131 |
<td>{get_type_badge(row['Type'])}</td>
|
132 |
<td class="score-cell">{get_score_bar(row['Obfuscated score'])}</td>
|
133 |
+
<td class="score-cell">{get_score_bar_secondary(row['Baseline score'])}</td>
|
134 |
</tr>
|
135 |
"""
|
136 |
|
|
|
144 |
with gr.Row(equal_height=True):
|
145 |
with gr.Column(scale=0.4):
|
146 |
sort_by = gr.Dropdown(
|
147 |
+
choices=["Score on obfuscated questions", "Score on all questions"],
|
148 |
+
value="Score on obfuscated questions",
|
149 |
label="Sort by",
|
150 |
)
|
151 |
|
utils.py
CHANGED
@@ -67,6 +67,37 @@ def get_score_bar(score):
|
|
67 |
">{width:.1f}</span>
|
68 |
</div>
|
69 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
def get_chart_colors():
|
71 |
# if is_dark_theme():
|
72 |
# return {
|
|
|
67 |
">{width:.1f}</span>
|
68 |
</div>
|
69 |
"""
|
70 |
+
|
71 |
+
def get_score_bar_secondary(score):
|
72 |
+
"""Generate HTML for score bar with gradient styling"""
|
73 |
+
width = score * 100
|
74 |
+
return f"""
|
75 |
+
<div style="display: flex; align-items: center; gap: 12px; width: 100%;">
|
76 |
+
<div style="
|
77 |
+
flex-grow: 1;
|
78 |
+
height: 8px;
|
79 |
+
background: var(--score-bg, rgba(255, 255, 255, 0.1));
|
80 |
+
border-radius: 4px;
|
81 |
+
overflow: hidden;
|
82 |
+
max-width: 200px;
|
83 |
+
">
|
84 |
+
<div style="
|
85 |
+
width: {width}%;
|
86 |
+
height: 100%;
|
87 |
+
background: linear-gradient(90deg, var(--accent-gray, #1f2937), var(--accent-gray-light, #9ca3af));
|
88 |
+
border-radius: 4px;
|
89 |
+
transition: width 0.3s ease;
|
90 |
+
"></div>
|
91 |
+
</div>
|
92 |
+
<span style="
|
93 |
+
font-family: 'SF Mono', monospace;
|
94 |
+
font-weight: 600;
|
95 |
+
color: var(--text-primary, #ffffff);
|
96 |
+
min-width: 60px;
|
97 |
+
">{width:.1f}</span>
|
98 |
+
</div>
|
99 |
+
"""
|
100 |
+
|
101 |
def get_chart_colors():
|
102 |
# if is_dark_theme():
|
103 |
# return {
|