atlas5301
committed on
Commit
·
064c454
1
Parent(s):
9648ca4
improve links and style
Browse files- data/long_context.csv +11 -11
- data/zero_context.csv +19 -19
- pages/long_context.py +4 -18
- pages/zero_noise.py +1 -21
- utils/style.py +44 -8
data/long_context.csv
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
-
Model,8K,16K,32K,Average
|
2 |
-
gemini-1.5-pro-002,1182.43,896.31,812.96,963.9
|
3 |
-
qwen-2.5-72b-instruct,927.33,681.53,563.65,724.17
|
4 |
-
mistral-large-2411,914.49,563.73,319.21,599.14
|
5 |
-
deepseek-v3,935.10,477.02,313.66,575.2
|
6 |
-
gemini-1.5-flash-002,673.88,476.72,377.38,509.3
|
7 |
-
llama-3.1-70b-instruct,479.00,394.50,355.5,409.67
|
8 |
-
minimax-text-01,481.32,359.56,325.95,388.94
|
9 |
-
gpt-4o-mini,401.00,337.81,275.63,338.15
|
10 |
-
qwen-2.5-7b-instruct,248.00,211.50,196.17,218.56
|
11 |
-
llama-3.1-8b-instruct,183.67,149.50,109.45,147.54
|
|
|
1 |
+
Model,8K,16K,32K,Average↑,Link
|
2 |
+
gemini-1.5-pro-002,1182.43,896.31,812.96,963.9,https://aistudio.google.com/app/prompts/new_chat?model=gemini-1.5-pro-002
|
3 |
+
qwen-2.5-72b-instruct,927.33,681.53,563.65,724.17,https://huggingface.co/Qwen/Qwen2.5-72B-Instruct
|
4 |
+
mistral-large-2411,914.49,563.73,319.21,599.14,https://huggingface.co/mistralai/Mistral-Large-Instruct-2411
|
5 |
+
deepseek-v3,935.10,477.02,313.66,575.2,https://huggingface.co/deepseek-ai/DeepSeek-V3
|
6 |
+
gemini-1.5-flash-002,673.88,476.72,377.38,509.3,https://aistudio.google.com/app/prompts/new_chat?model=gemini-1.5-flash-002
|
7 |
+
llama-3.1-70b-instruct,479.00,394.50,355.5,409.67,https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
8 |
+
minimax-text-01,481.32,359.56,325.95,388.94,https://huggingface.co/MiniMaxAI/MiniMax-Text-01
|
9 |
+
gpt-4o-mini,401.00,337.81,275.63,338.15,https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/
|
10 |
+
qwen-2.5-7b-instruct,248.00,211.50,196.17,218.56,https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
|
11 |
+
llama-3.1-8b-instruct,183.67,149.50,109.45,147.54,https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
|
data/zero_context.csv
CHANGED
@@ -1,19 +1,19 @@
|
|
1 |
-
Model,Row Color,Symbolic,Medium,Hard,1st<50% op,1st<10% op,Avg. Acc op≤30,Average
|
2 |
-
deepseek-r1,yellow,7280.0,9750.85,8573.8,100,130,0.9427,8534.88
|
3 |
-
o1-mini,yellow,5060.0,6054.91,3738.43,50,90,0.8397,4951.11
|
4 |
-
deepseek-v3,None,4310.0,4100.81,2407.86,24,55,0.6669,3606.22
|
5 |
-
qwq-32b-preview,yellow,3530.0,3205.75,1846.19,21,50,0.5403,2860.65
|
6 |
-
gemini-1.5-pro-002,None,2547.0,3659.59,2318.28,26,45,0.6924,2841.62
|
7 |
-
claude-3.5-sonnet,None,2161.0,3281.8,2115.79,26,40,0.6758,2519.53
|
8 |
-
mistral-large-2411,None,2332.5,2879.92,2310.49,24,50,0.6645,2507.64
|
9 |
-
qwen-2.5-72b-instruct,None,2048.0,2496.81,2016.38,21,40,0.5433,2187.06
|
10 |
-
gpt-4o-2024-11-20,None,2379.0,2457.37,1451.54,18,30,0.5064,2095.97
|
11 |
-
gemini-1.5-flash-002,None,1970.0,1478.75,1274.25,13,30,0.4460,1574.33
|
12 |
-
llama-3.1-70b-instruct,None,1769.0,1650.25,1205.25,15,30,0.4314,1541.50
|
13 |
-
minimax-text-01,green,1618.5,1712.64,1178.51,14,30,0.4213,1503.22
|
14 |
-
llama-3.1-405b-instruct,None,1557.0,1321.54,950.0,11,20,0.3409,1276.18
|
15 |
-
gpt-4o-mini,None,1389.0,1406.5,913.89,12,22,0.3094,1236.46
|
16 |
-
claude-3.5-haiku,None,897.0,1053.16,784.34,10,22,0.2910,911.50
|
17 |
-
qwen-2.5-7b-instruct,None,786.95,886.75,618.5,7,19,0.2257,764.07
|
18 |
-
llama-3.1-8b-instruct,None,462.0,786.5,606.5,6,17,0.2186,618.30
|
19 |
-
jamba-1.5-large,blue,856.0,485.13,466.4,6,26,0.1828,602.51
|
|
|
1 |
+
Model,Row Color,Symbolic,Medium,Hard,1st<50% op,1st<10% op,Avg. Acc op≤30,Average↑,Link
|
2 |
+
deepseek-r1,yellow,7280.0,9750.85,8573.8,100,130,0.9427,8534.88,https://huggingface.co/deepseek-ai/DeepSeek-R1
|
3 |
+
o1-mini,yellow,5060.0,6054.91,3738.43,50,90,0.8397,4951.11,https://platform.openai.com/docs/models/o1
|
4 |
+
deepseek-v3,None,4310.0,4100.81,2407.86,24,55,0.6669,3606.22,https://huggingface.co/deepseek-ai/DeepSeek-V3
|
5 |
+
qwq-32b-preview,yellow,3530.0,3205.75,1846.19,21,50,0.5403,2860.65,https://huggingface.co/Qwen/QwQ-32B-Preview
|
6 |
+
gemini-1.5-pro-002,None,2547.0,3659.59,2318.28,26,45,0.6924,2841.62,https://aistudio.google.com/app/prompts/new_chat?model=gemini-1.5-pro-002
|
7 |
+
claude-3.5-sonnet,None,2161.0,3281.8,2115.79,26,40,0.6758,2519.53,https://www.anthropic.com/news/3-5-models-and-computer-use
|
8 |
+
mistral-large-2411,None,2332.5,2879.92,2310.49,24,50,0.6645,2507.64,https://huggingface.co/mistralai/Mistral-Large-Instruct-2411
|
9 |
+
qwen-2.5-72b-instruct,None,2048.0,2496.81,2016.38,21,40,0.5433,2187.06,https://huggingface.co/Qwen/Qwen2.5-72B-Instruct
|
10 |
+
gpt-4o-2024-11-20,None,2379.0,2457.37,1451.54,18,30,0.5064,2095.97,https://platform.openai.com/docs/models/gpt-4o#gpt-4o
|
11 |
+
gemini-1.5-flash-002,None,1970.0,1478.75,1274.25,13,30,0.4460,1574.33,https://aistudio.google.com/app/prompts/new_chat?model=gemini-1.5-flash-002
|
12 |
+
llama-3.1-70b-instruct,None,1769.0,1650.25,1205.25,15,30,0.4314,1541.50,https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
|
13 |
+
minimax-text-01,green,1618.5,1712.64,1178.51,14,30,0.4213,1503.22,https://huggingface.co/MiniMaxAI/MiniMax-Text-01
|
14 |
+
llama-3.1-405b-instruct,None,1557.0,1321.54,950.0,11,20,0.3409,1276.18,https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
|
15 |
+
gpt-4o-mini,None,1389.0,1406.5,913.89,12,22,0.3094,1236.46,https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/
|
16 |
+
claude-3.5-haiku,None,897.0,1053.16,784.34,10,22,0.2910,911.50,https://www.anthropic.com/news/3-5-models-and-computer-use
|
17 |
+
qwen-2.5-7b-instruct,None,786.95,886.75,618.5,7,19,0.2257,764.07,https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
|
18 |
+
llama-3.1-8b-instruct,None,462.0,786.5,606.5,6,17,0.2186,618.30,https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
|
19 |
+
jamba-1.5-large,blue,856.0,485.13,466.4,6,26,0.1828,602.51,https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large
|
pages/long_context.py
CHANGED
@@ -10,28 +10,14 @@ def load_data():
|
|
10 |
|
11 |
def show():
|
12 |
st.title("Long Context Leaderboard")
|
13 |
-
|
14 |
# Load and style data
|
15 |
df = load_data()
|
16 |
styled_df = style_long_context(df)
|
17 |
|
18 |
-
|
19 |
-
st.dataframe(
|
20 |
-
|
21 |
-
|
22 |
-
height=35*(len(df)+1),
|
23 |
-
hide_index=True,
|
24 |
-
column_config={
|
25 |
-
"Model": st.column_config.TextColumn(width="large"),
|
26 |
-
"8K": st.column_config.NumberColumn(format="%.2f"),
|
27 |
-
"16K": st.column_config.NumberColumn(format="%.2f"),
|
28 |
-
"32K": st.column_config.NumberColumn(format="%.2f"),
|
29 |
-
"Average↑": st.column_config.NumberColumn(
|
30 |
-
format="%.2f",
|
31 |
-
help="Average across all context lengths"
|
32 |
-
)
|
33 |
-
}
|
34 |
-
)
|
35 |
|
36 |
# Optionally, keep some explanatory text
|
37 |
st.markdown("""
|
|
|
10 |
|
11 |
def show():
|
12 |
st.title("Long Context Leaderboard")
|
|
|
13 |
# Load and style data
|
14 |
df = load_data()
|
15 |
styled_df = style_long_context(df)
|
16 |
|
17 |
+
st.markdown(styled_df, unsafe_allow_html=True) # No need to call to_html() again
|
18 |
+
# st.dataframe(styled_df, use_container_width=True)
|
19 |
+
|
20 |
+
# st.html(styled_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
# Optionally, keep some explanatory text
|
23 |
st.markdown("""
|
pages/zero_noise.py
CHANGED
@@ -16,27 +16,7 @@ def show():
|
|
16 |
|
17 |
# Remove the manual sorting UI (selectbox, checkboxes) and let st.dataframe handle sorting.
|
18 |
styled_df = style_zero_context(raw_df)
|
19 |
-
|
20 |
-
# Directly show the dataframe
|
21 |
-
st.dataframe(
|
22 |
-
styled_df,
|
23 |
-
use_container_width=True,
|
24 |
-
hide_index=True,
|
25 |
-
height=35*(1+len(raw_df)),
|
26 |
-
column_config={
|
27 |
-
"Model": st.column_config.TextColumn(width="large"),
|
28 |
-
"Symbolic": st.column_config.NumberColumn(format="%.2f"),
|
29 |
-
"Medium": st.column_config.NumberColumn(format="%.2f"),
|
30 |
-
"Hard": st.column_config.NumberColumn(format="%.2f"),
|
31 |
-
"1st<50% op": st.column_config.NumberColumn(format="%.0f"),
|
32 |
-
"1st<10% op": st.column_config.NumberColumn(format="%.0f"),
|
33 |
-
"Avg. Acc op≤30": st.column_config.NumberColumn(format="%.4f"),
|
34 |
-
"Average↑": st.column_config.NumberColumn(
|
35 |
-
format="%.2f",
|
36 |
-
help="Average across all subsets"
|
37 |
-
)
|
38 |
-
}
|
39 |
-
)
|
40 |
|
41 |
# You can leave your explanation/description below
|
42 |
st.markdown("""
|
|
|
16 |
|
17 |
# Remove the manual sorting UI (selectbox, checkboxes) and let st.dataframe handle sorting.
|
18 |
styled_df = style_zero_context(raw_df)
|
19 |
+
st.markdown(styled_df, unsafe_allow_html=True) # No need to call to_html() again
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
# You can leave your explanation/description below
|
22 |
st.markdown("""
|
utils/style.py
CHANGED
@@ -1,11 +1,12 @@
|
|
1 |
import pandas as pd
|
2 |
import streamlit as st
|
|
|
3 |
# Define color maps for both light and dark modes
|
4 |
COLOR_MAP = {
|
5 |
"light": {
|
6 |
-
"yellow": "background-color: rgba(255, 255,
|
7 |
-
"green": "background-color: rgba(
|
8 |
-
"blue": "background-color: rgba(
|
9 |
},
|
10 |
}
|
11 |
|
@@ -31,8 +32,12 @@ def style_zero_context(df):
|
|
31 |
# Add any other special-cased models here
|
32 |
# "o1-mini": COLOR_MAP["yellow"], etc.
|
33 |
}
|
|
|
|
|
|
|
|
|
34 |
styler = df.style.apply(
|
35 |
-
lambda row: [color_mapping.get(row["Model"], "")]*len(row),
|
36 |
axis=1
|
37 |
)
|
38 |
|
@@ -57,6 +62,19 @@ def style_zero_context(df):
|
|
57 |
"Average↑": "{:,.2f}" # Format as number with thousands separator and 2 decimal places
|
58 |
})
|
59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
return styler
|
62 |
# Add styling for model types
|
@@ -65,13 +83,31 @@ def style_long_context(df):
|
|
65 |
"minimax-text-01": get_color_map()["green"],
|
66 |
"jamba-1.5-large": get_color_map()["blue"]
|
67 |
}
|
|
|
|
|
|
|
68 |
|
69 |
-
|
70 |
-
lambda row: [color_mapping.get(row["Model"], "")]*len(row),
|
71 |
axis=1
|
72 |
).format({
|
73 |
"8K": "{:,.2f}",
|
74 |
"16K": "{:,.2f}",
|
75 |
"32K": "{:,.2f}",
|
76 |
-
"Average↑": "{:,.2f}"
|
77 |
-
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import pandas as pd
|
2 |
import streamlit as st
|
3 |
+
import re
|
4 |
# Define color maps for both light and dark modes
|
5 |
COLOR_MAP = {
|
6 |
"light": {
|
7 |
+
"yellow": "background-color: rgba(255, 255, 128, 0.3)", # Reasoning models
|
8 |
+
"green": "background-color: rgba(192, 255, 192, 0.3)", # Linear attention hybrid
|
9 |
+
"blue": "background-color: rgba(192, 192, 255, 0.3)" # SSM hybrid models
|
10 |
},
|
11 |
}
|
12 |
|
|
|
32 |
# Add any other special-cased models here
|
33 |
# "o1-mini": COLOR_MAP["yellow"], etc.
|
34 |
}
|
35 |
+
# Add links to model names
|
36 |
+
df["Model"] = df.apply(lambda row: f'<a href="{row["Link"]}" target="_blank">{row["Model"]}</a>', axis=1)
|
37 |
+
df.drop(columns=["Link"], inplace=True)
|
38 |
+
|
39 |
styler = df.style.apply(
|
40 |
+
lambda row: [color_mapping.get(re.sub(r'<[^>]+>', '', row["Model"]), "")]*len(row),
|
41 |
axis=1
|
42 |
)
|
43 |
|
|
|
62 |
"Average↑": "{:,.2f}" # Format as number with thousands separator and 2 decimal places
|
63 |
})
|
64 |
|
65 |
+
html = styler.to_html(escape=False, index=False)
|
66 |
+
|
67 |
+
# Updated regex: target model name *before* link replacement and use word boundary
|
68 |
+
for model, style in color_mapping.items():
|
69 |
+
html = re.sub(rf'<tr[^>]*>\s*<td[^>]*>{re.escape(model)}<', rf'<tr style="{style}"><td>', html, re.M)
|
70 |
+
|
71 |
+
html = re.sub(
|
72 |
+
r'<table(.*?)>',
|
73 |
+
r'<table\1 style="width:100%; border-collapse:collapse;">',
|
74 |
+
html
|
75 |
+
)
|
76 |
+
return html # Return the modified HTML
|
77 |
+
|
78 |
|
79 |
return styler
|
80 |
# Add styling for model types
|
|
|
83 |
"minimax-text-01": get_color_map()["green"],
|
84 |
"jamba-1.5-large": get_color_map()["blue"]
|
85 |
}
|
86 |
+
|
87 |
+
df["Model"] = df.apply(lambda row: f'<a href="{row["Link"]}" target="_blank">{row["Model"]}</a>', axis=1)
|
88 |
+
df.drop(columns=["Link"], inplace=True)
|
89 |
|
90 |
+
styled_df = df.style.apply(
|
91 |
+
lambda row: [color_mapping.get(re.sub(r'<[^>]+>', '', row["Model"]), "")]*len(row),
|
92 |
axis=1
|
93 |
).format({
|
94 |
"8K": "{:,.2f}",
|
95 |
"16K": "{:,.2f}",
|
96 |
"32K": "{:,.2f}",
|
97 |
+
"Average↑": "{:,.2f}",
|
98 |
+
})
|
99 |
+
|
100 |
+
# Convert to HTML and add <a> tags
|
101 |
+
html = styled_df.to_html(escape=False, index=False)
|
102 |
+
|
103 |
+
# Updated regex: target model name *before* link replacement and use word boundary
|
104 |
+
for model, style in color_mapping.items():
|
105 |
+
html = re.sub(rf'<tr[^>]*>\s*<td[^>]*>{re.escape(model)}<', rf'<tr style="{style}"><td>', html, re.M)
|
106 |
+
|
107 |
+
html = re.sub(
|
108 |
+
r'<table(.*?)>',
|
109 |
+
r'<table\1 style="width:100%; border-collapse:collapse;">',
|
110 |
+
html
|
111 |
+
)
|
112 |
+
|
113 |
+
return html # Return the modified HTML
|