atlas5301 committed on
Commit
87248a0
·
1 Parent(s): 0a71821

initial commit

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__/
README.md DELETED
@@ -1,12 +0,0 @@
1
- ---
2
- title: Leaderboard Example
3
- emoji: 👀
4
- colorFrom: pink
5
- colorTo: indigo
6
- sdk: streamlit
7
- sdk_version: 1.42.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,4 +1,28 @@
1
  import streamlit as st
2
 
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
 
3
# Page-wide settings for the leaderboard app: browser-tab title and icon,
# full-width layout, and a sidebar that starts out collapsed.
st.set_page_config(
    initial_sidebar_state="collapsed",
    page_icon="📊",
    layout="wide",
    page_title="Benchmark Leaderboard",
)
9
+
10
def main():
    """Render the top-level tabs and delegate each one to its page module.

    Each tab is backed by a module inside the ``pages`` package that exposes
    a ``show()`` function. Modules are imported lazily, while their tab is
    being rendered. If a page module does not exist, a warning is shown
    inside that tab instead of aborting the whole app.
    """
    import importlib

    # (tab label, module name inside the ``pages`` package), in display order.
    tab_specs = [
        ("Zero Context Leaderboard", "zero_context"),
        ("Long Context Leaderboard", "long_context"),
        ("Benchmark Viewer", "benchmark_viewer"),
    ]

    tabs = st.tabs([label for label, _ in tab_specs])

    # Each "with" block is the content area of the corresponding tab.
    for tab, (label, module_name) in zip(tabs, tab_specs):
        qualified_name = f"pages.{module_name}"
        with tab:
            try:
                page = importlib.import_module(qualified_name)
            except ModuleNotFoundError as err:
                # Only tolerate "this page module does not exist"; a missing
                # dependency *inside* an existing page is a real error and
                # must still surface.
                if err.name != qualified_name:
                    raise
                st.warning(f"The '{label}' page is not available yet.")
                continue
            page.show()


if __name__ == "__main__":
    main()
data/long_context.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,8K,16K,32K,Average↑
2
+ gemini-1.5-pro-002,1182.43,896.31,812.96,963.9
3
+ qwen-2.5-72b-instruct,927.33,681.53,563.65,724.17
4
+ mistral-large-2411,914.49,563.73,319.21,599.14
5
+ deepseek-v3,935.10,477.02,313.66,575.2
6
+ gemini-1.5-flash-002,673.88,476.72,377.38,509.3
7
+ llama-3.1-70b-instruct,479.00,394.50,355.5,409.67
8
+ minimax-text-01,481.32,359.56,325.95,388.94
9
+ gpt-4o-mini,401.00,337.81,275.63,338.15
10
+ qwen-2.5-7b-instruct,248.00,211.50,196.17,218.56
11
+ llama-3.1-8b-instruct,183.67,149.50,109.45,147.54
data/zero_context.csv ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,Row Color,Symbolic,Medium,Hard,1st<50% op,1st<10% op,Avg. Acc op≤30,Average↑
2
+ deepseek-r1,yellow,7280.0,9750.85,8573.8,100,130,0.9427,8534.88
3
+ o1-mini,yellow,5060.0,6054.91,3738.43,50,90,0.8397,4951.11
4
+ deepseek-v3,None,4310.0,4100.81,2407.86,24,55,0.6669,3606.22
5
+ qwq-32b-preview,yellow,3530.0,3205.75,1846.19,21,50,0.5403,2860.65
6
+ gemini-1.5-pro-002,None,2547.0,3659.59,2318.28,26,45,0.6924,2841.62
7
+ claude-3.5-sonnet,None,2161.0,3281.8,2115.79,26,40,0.6758,2519.53
8
+ mistral-large-2411,None,2332.5,2879.92,2310.49,24,50,0.6645,2507.64
9
+ qwen-2.5-72b-instruct,None,2048.0,2496.81,2016.38,21,40,0.5433,2187.06
10
+ gpt-4o-2024-11-20,None,2379.0,2457.37,1451.54,18,30,0.5064,2095.97
11
+ gemini-1.5-flash-002,None,1970.0,1478.75,1274.25,13,30,0.4460,1574.33
12
+ llama-3.1-70b-instruct,None,1769.0,1650.25,1205.25,15,30,0.4314,1541.50
13
+ minimax-text-01,green,1618.5,1712.64,1178.51,14,30,0.4213,1503.22
14
+ llama-3.1-405b-instruct,None,1557.0,1321.54,950.0,11,20,0.3409,1276.18
15
+ gpt-4o-mini,None,1389.0,1406.5,913.89,12,22,0.3094,1236.46
16
+ claude-3.5-haiku,None,897.0,1053.16,784.34,10,22,0.2910,911.50
17
+ qwen-2.5-7b-instruct,None,786.95,886.75,618.5,7,19,0.2257,764.07
18
+ llama-3.1-8b-instruct,None,462.0,786.5,606.5,6,17,0.2186,618.30
19
+ jamba-1.5-large,blue,856.0,485.13,466.4,6,26,0.1828,602.51
pages/__init__.py ADDED
File without changes
pages/long_context.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from utils.style import style_long_context
4
+
5
@st.cache_data
def load_data():
    """Read the long-context leaderboard CSV into a DataFrame.

    The path is relative to the working directory of the running process.
    The function is decorated with ``st.cache_data``.
    """
    csv_path = "data/long_context.csv"
    leaderboard = pd.read_csv(csv_path)
    return leaderboard
8
+
9
def show():
    """Render the Long Context Leaderboard: title, results table, and notes."""
    st.title("Long Context Leaderboard")

    # Row colours and number formats come from the shared styling helper.
    table = style_long_context(load_data())

    # Per-column display settings; the three context-length columns share
    # the same two-decimal number format.
    columns = {"Model": st.column_config.TextColumn(width="large")}
    for context_length in ("8K", "16K", "32K"):
        columns[context_length] = st.column_config.NumberColumn(format="%.2f")
    columns["Average↑"] = st.column_config.NumberColumn(
        format="%.2f",
        help="Average across all context lengths",
    )

    # No custom sorting controls: the table widget is relied on for
    # sort-on-column-click.
    st.dataframe(
        table,
        use_container_width=True,
        height=600,
        hide_index=True,
        column_config=columns,
    )

    # Explanatory text shown underneath the table.
    st.markdown("""
    **Context Lengths**:
    - 8K: 8,000 tokens
    - 16K: 16,000 tokens
    - 32K: 32,000 tokens

    **Benchmark Details**:
    - Evaluated on Symbolic, Medium, and Hard subtasks
    - AUC scores aggregated across context lengths
    - Larger context evaluations limited by compute constraints
    - Scores normalized across task complexities
    """)
pages/zero_context.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from utils.style import style_zero_context
4
+
5
@st.cache_data
def load_data():
    """Read the zero-context leaderboard CSV, minus its ``Row Color`` column.

    The path is relative to the working directory of the running process.
    The function is decorated with ``st.cache_data``.

    Returns:
        pandas.DataFrame: the CSV contents with the ``Row Color`` column
        removed when it is present; all other columns are returned as read.
    """
    df = pd.read_csv("data/zero_context.csv")
    # errors="ignore" makes the drop a no-op when the column is absent, which
    # replaces both the explicit membership check and the in-place mutation.
    return df.drop(columns=["Row Color"], errors="ignore")
11
+
12
def show():
    """Render the Zero Context Leaderboard: title, results table, and notes."""
    st.title("Zero Context Leaderboard")

    # Row colours and number formats come from the shared styling helper.
    leaderboard = style_zero_context(load_data())

    # Per-column display settings, grouped by the number format they share.
    two_decimal_columns = ("Symbolic", "Medium", "Hard")
    whole_number_columns = ("1st<50% op", "1st<10% op")

    columns = {"Model": st.column_config.TextColumn(width="large")}
    for name in two_decimal_columns:
        columns[name] = st.column_config.NumberColumn(format="%.2f")
    for name in whole_number_columns:
        columns[name] = st.column_config.NumberColumn(format="%.0f")
    columns["Avg. Acc op≤30"] = st.column_config.NumberColumn(format="%.4f")
    columns["Average↑"] = st.column_config.NumberColumn(
        format="%.2f",
        help="Average across all subsets",
    )

    # No custom sorting controls: sorting is left to the table widget itself.
    st.dataframe(
        leaderboard,
        use_container_width=True,
        hide_index=True,
        height=800,
        column_config=columns,
    )

    # Explanatory text shown underneath the table.
    st.markdown("""
    **Evaluation Criteria:**
    - **AUC Calculation:** ...
    - **Threshold Ops:** ...
    """)
utils/style.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
# CSS background declarations for highlighted leaderboard rows, keyed by
# colour name. Each colour marks one family of models.
COLOR_MAP = {
    color: f"background-color: {hex_code}"
    for color, hex_code in (
        ("yellow", "#FFFFCC"),  # Reasoning models
        ("green", "#E3FBE9"),  # Linear attention hybrid
        ("blue", "#E6F4FF"),  # SSM hybrid models
    )
}
8
+
9
def style_zero_context(df):
    """Build a pandas Styler for the zero-context leaderboard table.

    Two things are applied to ``df.style``:

    1. Row colouring: every cell of a row whose ``Model`` value is listed in
       the mapping below gets that model's background colour from
       ``COLOR_MAP``; rows for any other model get no extra CSS.
    2. Display formats for the numeric columns.

    Args:
        df: leaderboard DataFrame; its ``Model`` column is read for each row.

    Returns:
        The Styler obtained from ``df.style`` with the row colouring and the
        number formats applied.
    """
    # Exact model names that get a highlighted row. To highlight another
    # model, add its name here with the matching COLOR_MAP entry.
    color_mapping = {
        "minimax-text-01": COLOR_MAP["green"],
        "jamba-1.5-large": COLOR_MAP["blue"],
        "deepseek-r1": COLOR_MAP["yellow"],
        "o1-mini": COLOR_MAP["yellow"],
        "qwq-32b-preview": COLOR_MAP["yellow"],
    }

    def color_row(row):
        # One CSS string per cell so the whole row shares one background.
        return [color_mapping.get(row["Model"], "")] * len(row)

    styler = df.style.apply(color_row, axis=1)

    return styler.format({
        # Thousands separator, two decimal places.
        "Symbolic": "{:,.2f}",
        "Medium": "{:,.2f}",
        "Hard": "{:,.2f}",
        # Thousands separator, no decimal places.
        "1st<50% op": "{:,.0f}",
        "1st<10% op": "{:,.0f}",
        # Four decimal places, no thousands separator.
        "Avg. Acc op≤30": "{:.4f}",
        # Thousands separator, two decimal places.
        "Average↑": "{:,.2f}",
    })
55
+ # Add styling for model types
56
def style_long_context(df):
    """Build a pandas Styler for the long-context leaderboard table.

    Rows whose ``Model`` value is one of the names below get that model's
    background colour from ``COLOR_MAP`` on every cell; other rows get no
    extra CSS. The context-length columns and the average column are shown
    with a thousands separator and two decimal places.

    Args:
        df: leaderboard DataFrame; its ``Model`` column is read for each row.

    Returns:
        The Styler obtained from ``df.style`` with colouring and formats set.
    """
    row_css = {
        "minimax-text-01": COLOR_MAP["green"],
        "jamba-1.5-large": COLOR_MAP["blue"],
    }

    def paint(row):
        # Repeat the row's CSS once per cell so the full row is coloured.
        css = row_css.get(row["Model"], "")
        return [css] * len(row)

    number_formats = dict.fromkeys(("8K", "16K", "32K", "Average↑"), "{:,.2f}")

    colored = df.style.apply(paint, axis=1)
    return colored.format(number_formats)