Spaces:

InfiniAILab
/

GSM-Infinite-Leaderboard

Running

App Files Files Community

atlas5301 committed on Feb 7

Commit

064c454

1 Parent(s): 9648ca4

improve links and style

Browse files

Files changed (5) hide show

data/long_context.csv +11 -11
data/zero_context.csv +19 -19
pages/long_context.py +4 -18
pages/zero_noise.py +1 -21
utils/style.py +44 -8

data/long_context.csv CHANGED Viewed

@@ -1,11 +1,11 @@
-Model,8K,16K,32K,Average↑
-gemini-1.5-pro-002,1182.43,896.31,812.96,963.9
-qwen-2.5-72b-instruct,927.33,681.53,563.65,724.17
-mistral-large-2411,914.49,563.73,319.21,599.14
-deepseek-v3,935.10,477.02,313.66,575.2
-gemini-1.5-flash-002,673.88,476.72,377.38,509.3
-llama-3.1-70b-instruct,479.00,394.50,355.5,409.67
-minimax-text-01,481.32,359.56,325.95,388.94
-gpt-4o-mini,401.00,337.81,275.63,338.15
-qwen-2.5-7b-instruct,248.00,211.50,196.17,218.56
-llama-3.1-8b-instruct,183.67,149.50,109.45,147.54

+Model,8K,16K,32K,Average↑,Link
+gemini-1.5-pro-002,1182.43,896.31,812.96,963.9,https://aistudio.google.com/app/prompts/new_chat?model=gemini-1.5-pro-002
+qwen-2.5-72b-instruct,927.33,681.53,563.65,724.17,https://huggingface.co/Qwen/Qwen2.5-72B-Instruct
+mistral-large-2411,914.49,563.73,319.21,599.14,https://huggingface.co/mistralai/Mistral-Large-Instruct-2411
+deepseek-v3,935.10,477.02,313.66,575.2,https://huggingface.co/deepseek-ai/DeepSeek-V3
+gemini-1.5-flash-002,673.88,476.72,377.38,509.3,https://aistudio.google.com/app/prompts/new_chat?model=gemini-1.5-flash-002
+llama-3.1-70b-instruct,479.00,394.50,355.5,409.67,https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+minimax-text-01,481.32,359.56,325.95,388.94,https://huggingface.co/MiniMaxAI/MiniMax-Text-01
+gpt-4o-mini,401.00,337.81,275.63,338.15,https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/
+qwen-2.5-7b-instruct,248.00,211.50,196.17,218.56,https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
+llama-3.1-8b-instruct,183.67,149.50,109.45,147.54,https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct

data/zero_context.csv CHANGED Viewed

@@ -1,19 +1,19 @@
-Model,Row Color,Symbolic,Medium,Hard,1st<50% op,1st<10% op,Avg. Acc op≤30,Average↑
-deepseek-r1,yellow,7280.0,9750.85,8573.8,100,130,0.9427,8534.88
-o1-mini,yellow,5060.0,6054.91,3738.43,50,90,0.8397,4951.11
-deepseek-v3,None,4310.0,4100.81,2407.86,24,55,0.6669,3606.22
-qwq-32b-preview,yellow,3530.0,3205.75,1846.19,21,50,0.5403,2860.65
-gemini-1.5-pro-002,None,2547.0,3659.59,2318.28,26,45,0.6924,2841.62
-claude-3.5-sonnet,None,2161.0,3281.8,2115.79,26,40,0.6758,2519.53
-mistral-large-2411,None,2332.5,2879.92,2310.49,24,50,0.6645,2507.64
-qwen-2.5-72b-instruct,None,2048.0,2496.81,2016.38,21,40,0.5433,2187.06
-gpt-4o-2024-11-20,None,2379.0,2457.37,1451.54,18,30,0.5064,2095.97
-gemini-1.5-flash-002,None,1970.0,1478.75,1274.25,13,30,0.4460,1574.33
-llama-3.1-70b-instruct,None,1769.0,1650.25,1205.25,15,30,0.4314,1541.50
-minimax-text-01,green,1618.5,1712.64,1178.51,14,30,0.4213,1503.22
-llama-3.1-405b-instruct,None,1557.0,1321.54,950.0,11,20,0.3409,1276.18
-gpt-4o-mini,None,1389.0,1406.5,913.89,12,22,0.3094,1236.46
-claude-3.5-haiku,None,897.0,1053.16,784.34,10,22,0.2910,911.50
-qwen-2.5-7b-instruct,None,786.95,886.75,618.5,7,19,0.2257,764.07
-llama-3.1-8b-instruct,None,462.0,786.5,606.5,6,17,0.2186,618.30
-jamba-1.5-large,blue,856.0,485.13,466.4,6,26,0.1828,602.51

+Model,Row Color,Symbolic,Medium,Hard,1st<50% op,1st<10% op,Avg. Acc op≤30,Average↑,Link
+deepseek-r1,yellow,7280.0,9750.85,8573.8,100,130,0.9427,8534.88,https://huggingface.co/deepseek-ai/DeepSeek-R1
+o1-mini,yellow,5060.0,6054.91,3738.43,50,90,0.8397,4951.11,https://platform.openai.com/docs/models/o1
+deepseek-v3,None,4310.0,4100.81,2407.86,24,55,0.6669,3606.22,https://huggingface.co/deepseek-ai/DeepSeek-V3
+qwq-32b-preview,yellow,3530.0,3205.75,1846.19,21,50,0.5403,2860.65,https://huggingface.co/Qwen/QwQ-32B-Preview
+gemini-1.5-pro-002,None,2547.0,3659.59,2318.28,26,45,0.6924,2841.62,https://aistudio.google.com/app/prompts/new_chat?model=gemini-1.5-pro-002
+claude-3.5-sonnet,None,2161.0,3281.8,2115.79,26,40,0.6758,2519.53,https://www.anthropic.com/news/3-5-models-and-computer-use
+mistral-large-2411,None,2332.5,2879.92,2310.49,24,50,0.6645,2507.64,https://huggingface.co/mistralai/Mistral-Large-Instruct-2411
+qwen-2.5-72b-instruct,None,2048.0,2496.81,2016.38,21,40,0.5433,2187.06,https://huggingface.co/Qwen/Qwen2.5-72B-Instruct
+gpt-4o-2024-11-20,None,2379.0,2457.37,1451.54,18,30,0.5064,2095.97,https://platform.openai.com/docs/models/gpt-4o#gpt-4o
+gemini-1.5-flash-002,None,1970.0,1478.75,1274.25,13,30,0.4460,1574.33,https://aistudio.google.com/app/prompts/new_chat?model=gemini-1.5-flash-002
+llama-3.1-70b-instruct,None,1769.0,1650.25,1205.25,15,30,0.4314,1541.50,https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct
+minimax-text-01,green,1618.5,1712.64,1178.51,14,30,0.4213,1503.22,https://huggingface.co/MiniMaxAI/MiniMax-Text-01
+llama-3.1-405b-instruct,None,1557.0,1321.54,950.0,11,20,0.3409,1276.18,https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct
+gpt-4o-mini,None,1389.0,1406.5,913.89,12,22,0.3094,1236.46,https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/
+claude-3.5-haiku,None,897.0,1053.16,784.34,10,22,0.2910,911.50,https://www.anthropic.com/news/3-5-models-and-computer-use
+qwen-2.5-7b-instruct,None,786.95,886.75,618.5,7,19,0.2257,764.07,https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
+llama-3.1-8b-instruct,None,462.0,786.5,606.5,6,17,0.2186,618.30,https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+jamba-1.5-large,blue,856.0,485.13,466.4,6,26,0.1828,602.51,https://huggingface.co/ai21labs/AI21-Jamba-1.5-Large

pages/long_context.py CHANGED Viewed

@@ -10,28 +10,14 @@ def load_data():
 def show():
     st.title("Long Context Leaderboard")
     # Load and style data
     df = load_data()
     styled_df = style_long_context(df)
-    # Display the dataframe with built-in sort on column click
-    st.dataframe(
-        styled_df,
-        use_container_width=True,
-        height=35*(len(df)+1),
-        hide_index=True,
-        column_config={
-            "Model": st.column_config.TextColumn(width="large"),
-            "8K": st.column_config.NumberColumn(format="%.2f"),
-            "16K": st.column_config.NumberColumn(format="%.2f"),
-            "32K": st.column_config.NumberColumn(format="%.2f"),
-            "Average↑": st.column_config.NumberColumn(
-                format="%.2f",
-                help="Average across all context lengths"
-            )
-        }
-    )
     # Optionally, keep some explanatory text
     st.markdown("""

 def show():
     st.title("Long Context Leaderboard")
     # Load and style data
     df = load_data()
     styled_df = style_long_context(df)
+    st.markdown(styled_df, unsafe_allow_html=True) # No need to call to_html() again
+    # st.dataframe(styled_df, use_container_width=True)
+    # st.html(styled_df)
     # Optionally, keep some explanatory text
     st.markdown("""

pages/zero_noise.py CHANGED Viewed

@@ -16,27 +16,7 @@ def show():
     # Remove the manual sorting UI (selectbox, checkboxes) and let st.dataframe handle sorting.
     styled_df = style_zero_context(raw_df)
-    # Directly show the dataframe
-    st.dataframe(
-        styled_df,
-        use_container_width=True,
-        hide_index=True,
-        height=35*(1+len(raw_df)),
-        column_config={
-            "Model": st.column_config.TextColumn(width="large"),
-            "Symbolic": st.column_config.NumberColumn(format="%.2f"),
-            "Medium": st.column_config.NumberColumn(format="%.2f"),
-            "Hard": st.column_config.NumberColumn(format="%.2f"),
-            "1st<50% op": st.column_config.NumberColumn(format="%.0f"),
-            "1st<10% op": st.column_config.NumberColumn(format="%.0f"),
-            "Avg. Acc op≤30": st.column_config.NumberColumn(format="%.4f"),
-            "Average↑": st.column_config.NumberColumn(
-                format="%.2f",
-                help="Average across all subsets"
-            )
-        }
-    )
     # You can leave your explanation/description below
     st.markdown("""

     # Remove the manual sorting UI (selectbox, checkboxes) and let st.dataframe handle sorting.
     styled_df = style_zero_context(raw_df)
+    st.markdown(styled_df, unsafe_allow_html=True) # No need to call to_html() again
     # You can leave your explanation/description below
     st.markdown("""

utils/style.py CHANGED Viewed

@@ -1,11 +1,12 @@
 import pandas as pd
 import streamlit as st
 # Define color maps for both light and dark modes
 COLOR_MAP = {
     "light": {
-        "yellow": "background-color: rgba(255, 255, 204, 0.5)",  # Reasoning models
-        "green": "background-color: rgba(227, 251, 233, 0.5)",   # Linear attention hybrid
-        "blue": "background-color: rgba(230, 244, 255, 0.5)"     # SSM hybrid models
     },
 }
@@ -31,8 +32,12 @@ def style_zero_context(df):
         # Add any other special-cased models here
         # "o1-mini": COLOR_MAP["yellow"], etc.
     }
     styler = df.style.apply(
-        lambda row: [color_mapping.get(row["Model"], "")]*len(row),
         axis=1
     )
@@ -57,6 +62,19 @@ def style_zero_context(df):
         "Average↑": "{:,.2f}"      # Format as number with thousands separator and 2 decimal places
     })
     return styler
 # Add styling for model types
@@ -65,13 +83,31 @@ def style_long_context(df):
         "minimax-text-01": get_color_map()["green"],
         "jamba-1.5-large": get_color_map()["blue"]
     }
-    return df.style.apply(
-        lambda row: [color_mapping.get(row["Model"], "")]*len(row),
         axis=1
     ).format({
         "8K": "{:,.2f}",
         "16K": "{:,.2f}",
         "32K": "{:,.2f}",
-        "Average↑": "{:,.2f}"
-    })

 import pandas as pd
 import streamlit as st
+import re
 # Define color maps for both light and dark modes
 COLOR_MAP = {
     "light": {
+        "yellow": "background-color: rgba(255, 255, 128, 0.3)",  # Reasoning models
+        "green": "background-color: rgba(192, 255, 192, 0.3)",   # Linear attention hybrid
+        "blue": "background-color: rgba(192, 192, 255, 0.3)"     # SSM hybrid models
     },
 }
         # Add any other special-cased models here
         # "o1-mini": COLOR_MAP["yellow"], etc.
     }
+    # Add links to model names
+    df["Model"] = df.apply(lambda row: f'<a href="{row["Link"]}" target="_blank">{row["Model"]}</a>', axis=1)
+    df.drop(columns=["Link"], inplace=True)
     styler = df.style.apply(
+        lambda row: [color_mapping.get(re.sub(r'<[^>]+>', '', row["Model"]), "")]*len(row),
         axis=1
     )
         "Average↑": "{:,.2f}"      # Format as number with thousands separator and 2 decimal places
     })
+    html = styler.to_html(escape=False, index=False)
+    # Updated regex: target model name *before* link replacement and use word boundary
+    for model, style in color_mapping.items():
+        html = re.sub(rf'<tr[^>]*>\s*<td[^>]*>{re.escape(model)}<', rf'<tr style="{style}"><td>', html, re.M)
+    html = re.sub(
+        r'<table(.*?)>',
+        r'<table\1 style="width:100%; border-collapse:collapse;">',
+        html
+    )
+    return html # Return the modified HTML
     return styler
 # Add styling for model types
         "minimax-text-01": get_color_map()["green"],
         "jamba-1.5-large": get_color_map()["blue"]
     }
+    df["Model"] = df.apply(lambda row: f'<a href="{row["Link"]}" target="_blank">{row["Model"]}</a>', axis=1)
+    df.drop(columns=["Link"], inplace=True)
+    styled_df = df.style.apply(
+        lambda row: [color_mapping.get(re.sub(r'<[^>]+>', '', row["Model"]), "")]*len(row),
         axis=1
     ).format({
         "8K": "{:,.2f}",
         "16K": "{:,.2f}",
         "32K": "{:,.2f}",
+        "Average↑": "{:,.2f}",
+    })
+    # Convert to HTML and add <a> tags
+    html = styled_df.to_html(escape=False, index=False)
+    # Updated regex: target model name *before* link replacement and use word boundary
+    for model, style in color_mapping.items():
+        html = re.sub(rf'<tr[^>]*>\s*<td[^>]*>{re.escape(model)}<', rf'<tr style="{style}"><td>', html, re.M)
+    html = re.sub(
+        r'<table(.*?)>',
+        r'<table\1 style="width:100%; border-collapse:collapse;">',
+        html
+    )
+    return html # Return the modified HTML