atlas5301
committed on
Commit
·
87248a0
1
Parent(s):
0a71821
initial commit
Browse files- .gitignore +1 -0
- README.md +0 -12
- app.py +26 -2
- data/long_context.csv +11 -0
- data/zero_context.csv +19 -0
- pages/__init__.py +0 -0
- pages/long_context.py +46 -0
- pages/zero_context.py +46 -0
- utils/style.py +70 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
__pycache__/
|
README.md
DELETED
@@ -1,12 +0,0 @@
|
|
1 |
-
---
|
2 |
-
title: Leaderboard Example
|
3 |
-
emoji: 👀
|
4 |
-
colorFrom: pink
|
5 |
-
colorTo: indigo
|
6 |
-
sdk: streamlit
|
7 |
-
sdk_version: 1.42.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
@@ -1,4 +1,28 @@
|
|
1 |
import streamlit as st
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
|
3 |
+
# Global page settings for the leaderboard app.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="collapsed",  # sidebar starts out collapsed
    page_title="Benchmark Leaderboard",
    page_icon="📊",
)
|
9 |
+
|
10 |
+
def main():
    """Render the app as three tabs, one per leaderboard/viewer page.

    Each page module is imported lazily inside its tab and is expected to
    expose a ``show()`` function that draws the tab's content.

    The "Benchmark Viewer" page is optional: when the module
    ``pages.benchmark_viewer`` does not exist, a placeholder message is shown
    in that tab instead of crashing the whole app. Any other import failure
    (for example a missing dependency inside that module) is re-raised.
    """
    import importlib

    # Create tabs at the top of the page
    zero_tab, long_tab, viewer_tab = st.tabs(
        ["Zero Context Leaderboard", "Long Context Leaderboard", "Benchmark Viewer"]
    )

    # Each "with" block corresponds to a content area for that tab.
    with zero_tab:
        from pages import zero_context
        zero_context.show()

    with long_tab:
        from pages import long_context
        long_context.show()

    with viewer_tab:
        try:
            benchmark_viewer = importlib.import_module("pages.benchmark_viewer")
        except ModuleNotFoundError as exc:
            # Only swallow "the viewer module itself is absent"; a missing
            # dependency *inside* the viewer is a real error and must surface.
            if exc.name != "pages.benchmark_viewer":
                raise
            st.info("The Benchmark Viewer is not available yet.")
        else:
            benchmark_viewer.show()


if __name__ == "__main__":
    main()
|
data/long_context.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,8K,16K,32K,Average↑
|
2 |
+
gemini-1.5-pro-002,1182.43,896.31,812.96,963.9
|
3 |
+
qwen-2.5-72b-instruct,927.33,681.53,563.65,724.17
|
4 |
+
mistral-large-2411,914.49,563.73,319.21,599.14
|
5 |
+
deepseek-v3,935.10,477.02,313.66,575.2
|
6 |
+
gemini-1.5-flash-002,673.88,476.72,377.38,509.3
|
7 |
+
llama-3.1-70b-instruct,479.00,394.50,355.5,409.67
|
8 |
+
minimax-text-01,481.32,359.56,325.95,388.94
|
9 |
+
gpt-4o-mini,401.00,337.81,275.63,338.15
|
10 |
+
qwen-2.5-7b-instruct,248.00,211.50,196.17,218.56
|
11 |
+
llama-3.1-8b-instruct,183.67,149.50,109.45,147.54
|
data/zero_context.csv
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,Row Color,Symbolic,Medium,Hard,1st<50% op,1st<10% op,Avg. Acc op≤30,Average↑
|
2 |
+
deepseek-r1,yellow,7280.0,9750.85,8573.8,100,130,0.9427,8534.88
|
3 |
+
o1-mini,yellow,5060.0,6054.91,3738.43,50,90,0.8397,4951.11
|
4 |
+
deepseek-v3,None,4310.0,4100.81,2407.86,24,55,0.6669,3606.22
|
5 |
+
qwq-32b-preview,yellow,3530.0,3205.75,1846.19,21,50,0.5403,2860.65
|
6 |
+
gemini-1.5-pro-002,None,2547.0,3659.59,2318.28,26,45,0.6924,2841.62
|
7 |
+
claude-3.5-sonnet,None,2161.0,3281.8,2115.79,26,40,0.6758,2519.53
|
8 |
+
mistral-large-2411,None,2332.5,2879.92,2310.49,24,50,0.6645,2507.64
|
9 |
+
qwen-2.5-72b-instruct,None,2048.0,2496.81,2016.38,21,40,0.5433,2187.06
|
10 |
+
gpt-4o-2024-11-20,None,2379.0,2457.37,1451.54,18,30,0.5064,2095.97
|
11 |
+
gemini-1.5-flash-002,None,1970.0,1478.75,1274.25,13,30,0.4460,1574.33
|
12 |
+
llama-3.1-70b-instruct,None,1769.0,1650.25,1205.25,15,30,0.4314,1541.50
|
13 |
+
minimax-text-01,green,1618.5,1712.64,1178.51,14,30,0.4213,1503.22
|
14 |
+
llama-3.1-405b-instruct,None,1557.0,1321.54,950.0,11,20,0.3409,1276.18
|
15 |
+
gpt-4o-mini,None,1389.0,1406.5,913.89,12,22,0.3094,1236.46
|
16 |
+
claude-3.5-haiku,None,897.0,1053.16,784.34,10,22,0.2910,911.50
|
17 |
+
qwen-2.5-7b-instruct,None,786.95,886.75,618.5,7,19,0.2257,764.07
|
18 |
+
llama-3.1-8b-instruct,None,462.0,786.5,606.5,6,17,0.2186,618.30
|
19 |
+
jamba-1.5-large,blue,856.0,485.13,466.4,6,26,0.1828,602.51
|
pages/__init__.py
ADDED
File without changes
|
pages/long_context.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
from utils.style import style_long_context
|
4 |
+
|
5 |
+
@st.cache_data
def load_data():
    """Load the long-context leaderboard table from ``data/long_context.csv``.

    The file is located relative to this source file (one directory above
    ``pages/``) rather than the process working directory, so the app also
    works when it is launched from a directory other than the repository root.
    The call is wrapped in ``st.cache_data``.

    Returns:
        pandas.DataFrame: the parsed CSV.
    """
    from pathlib import Path

    csv_path = Path(__file__).resolve().parent.parent / "data" / "long_context.csv"
    return pd.read_csv(csv_path)
|
8 |
+
|
9 |
+
def show():
    """Draw the Long Context leaderboard: title, styled table, and notes."""
    st.title("Long Context Leaderboard")

    # Fetch the table and wrap it in its Styler in one step.
    leaderboard = style_long_context(load_data())

    # Per-column display settings, assembled before the table is drawn.
    two_decimals = "%.2f"
    columns = {"Model": st.column_config.TextColumn(width="large")}
    for context_length in ("8K", "16K", "32K"):
        columns[context_length] = st.column_config.NumberColumn(format=two_decimals)
    columns["Average↑"] = st.column_config.NumberColumn(
        format=two_decimals,
        help="Average across all context lengths",
    )

    # st.dataframe provides sorting when a column header is clicked.
    st.dataframe(
        leaderboard,
        column_config=columns,
        hide_index=True,
        height=600,
        use_container_width=True,
    )

    # Explanatory text shown beneath the table.
    st.markdown(
        "\n"
        "**Context Lengths**:\n"
        "- 8K: 8,000 tokens\n"
        "- 16K: 16,000 tokens\n"
        "- 32K: 32,000 tokens\n"
        "\n"
        "**Benchmark Details**:\n"
        "- Evaluated on Symbolic, Medium, and Hard subtasks\n"
        "- AUC scores aggregated across context lengths\n"
        "- Larger context evaluations limited by compute constraints\n"
        "- Scores normalized across task complexities\n"
    )
|
pages/zero_context.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
from utils.style import style_zero_context
|
4 |
+
|
5 |
+
@st.cache_data
def load_data():
    """Load the zero-context leaderboard table from ``data/zero_context.csv``.

    The file is located relative to this source file (one directory above
    ``pages/``) rather than the process working directory, so the app also
    works when it is launched from a directory other than the repository root.
    The "Row Color" column, when present, is removed before the table is
    returned. The call is wrapped in ``st.cache_data``.

    Returns:
        pandas.DataFrame: the parsed CSV without the "Row Color" column.
    """
    from pathlib import Path

    csv_path = Path(__file__).resolve().parent.parent / "data" / "zero_context.csv"
    table = pd.read_csv(csv_path)
    # errors="ignore" makes the drop a no-op when the column is absent.
    return table.drop(columns=["Row Color"], errors="ignore")
|
11 |
+
|
12 |
+
def show():
    """Draw the Zero Context leaderboard: title, styled table, and notes."""
    st.title("Zero Context Leaderboard")

    # Fetch the table and wrap it in its Styler; sorting is left to st.dataframe.
    leaderboard = style_zero_context(load_data())

    # (column name, printf-style format) for the plain numeric columns.
    number_formats = (
        ("Symbolic", "%.2f"),
        ("Medium", "%.2f"),
        ("Hard", "%.2f"),
        ("1st<50% op", "%.0f"),
        ("1st<10% op", "%.0f"),
        ("Avg. Acc op≤30", "%.4f"),
    )
    columns = {"Model": st.column_config.TextColumn(width="large")}
    for column_name, number_format in number_formats:
        columns[column_name] = st.column_config.NumberColumn(format=number_format)
    columns["Average↑"] = st.column_config.NumberColumn(
        format="%.2f",
        help="Average across all subsets",
    )

    st.dataframe(
        leaderboard,
        column_config=columns,
        height=800,
        hide_index=True,
        use_container_width=True,
    )

    # Explanatory text shown beneath the table.
    st.markdown(
        "\n"
        "**Evaluation Criteria:**\n"
        "- **AUC Calculation:** ...\n"
        "- **Threshold Ops:** ...\n"
    )
|
utils/style.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
# Row highlight styles: colour label -> CSS declaration used for the whole row.
COLOR_MAP = {
    label: f"background-color: {hex_code}"
    for label, hex_code in (
        ("yellow", "#FFFFCC"),  # Reasoning models
        ("green", "#E3FBE9"),  # Linear attention hybrid
        ("blue", "#E6F4FF"),  # SSM hybrid models
    )
}
|
8 |
+
|
9 |
+
def style_zero_context(df):
    """Build a pandas Styler for the zero-context leaderboard table.

    Two things are applied:
      1) rows whose "Model" value is listed in the local colour mapping get
         that background colour on every cell; all other rows are unstyled;
      2) the numeric columns get fixed display formats.

    Args:
        df: leaderboard DataFrame; the row colouring reads its "Model" column.

    Returns:
        The Styler obtained from ``df.style`` with colouring and formats set.
    """
    # Model name -> CSS for the whole row.
    color_mapping = {
        "minimax-text-01": COLOR_MAP["green"],
        "jamba-1.5-large": COLOR_MAP["blue"],
        "deepseek-r1": COLOR_MAP["yellow"],
        "o1-mini": COLOR_MAP["yellow"],
        "qwq-32b-preview": COLOR_MAP["yellow"],
        # Add any other special-cased models here
    }

    def highlight_row(row):
        # One CSS string per cell; "" leaves the cell unstyled.
        return [color_mapping.get(row["Model"], "")] * len(row)

    styler = df.style.apply(highlight_row, axis=1)

    # Apply numeric formatting
    return styler.format({
        "Symbolic": "{:,.2f}",  # thousands separator, 2 decimal places
        "Medium": "{:,.2f}",  # thousands separator, 2 decimal places
        "Hard": "{:,.2f}",  # thousands separator, 2 decimal places
        "1st<50% op": "{:,.0f}",  # thousands separator, no decimal places
        "1st<10% op": "{:,.0f}",  # thousands separator, no decimal places
        "Avg. Acc op≤30": "{:.4f}",  # 4 decimal places
        "Average↑": "{:,.2f}",  # thousands separator, 2 decimal places
    })
|
55 |
+
def style_long_context(df):
    """Build a pandas Styler for the long-context leaderboard table.

    Rows whose "Model" value is listed in the local colour mapping get that
    background colour on every cell, and the "8K", "16K", "32K" and
    "Average↑" columns are displayed with a thousands separator and two
    decimal places.

    Args:
        df: leaderboard DataFrame; the row colouring reads its "Model" column.

    Returns:
        The Styler obtained from ``df.style`` with colouring and formats set.
    """
    # Model name -> CSS for the whole row.
    highlighted = {
        "minimax-text-01": COLOR_MAP["green"],
        "jamba-1.5-large": COLOR_MAP["blue"],
    }

    two_decimals = "{:,.2f}"
    number_formats = {
        column: two_decimals for column in ("8K", "16K", "32K", "Average↑")
    }

    def paint(row):
        # Same CSS string for each cell of the row; "" means no styling.
        css = highlighted.get(row["Model"], "")
        return [css for _ in range(len(row))]

    styler = df.style.apply(paint, axis=1)
    return styler.format(number_formats)
|