Spaces:

InfiniAILab
/

GSM-Infinite-Leaderboard

Running

App Files Files Community

atlas5301 committed on Feb 5

Commit

87248a0

1 Parent(s): 0a71821

initial commit

Browse files

Files changed (9) hide show

.gitignore +1 -0
README.md +0 -12
app.py +26 -2
data/long_context.csv +11 -0
data/zero_context.csv +19 -0
pages/__init__.py +0 -0
pages/long_context.py +46 -0
pages/zero_context.py +46 -0
utils/style.py +70 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__/

README.md DELETED Viewed

@@ -1,12 +0,0 @@
----
-title: Leaderboard Example
-emoji: 👀
-colorFrom: pink
-colorTo: indigo
-sdk: streamlit
-sdk_version: 1.42.0
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,4 +1,28 @@
 import streamlit as st
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)

 import streamlit as st
+# Page-wide settings for the leaderboard app: wide layout, and the sidebar
+# starts collapsed.
+st.set_page_config(
+    initial_sidebar_state="collapsed",
+    page_icon="📊",
+    layout="wide",
+    page_title="Benchmark Leaderboard",
+)
+def main():
+    """Render the leaderboard app as three top-level tabs.
+
+    Each tab lazily imports its page module from the ``pages`` package and
+    calls that module's ``show()`` to draw the tab's content.
+    """
+    zero_tab, long_tab, viewer_tab = st.tabs(
+        ["Zero Context Leaderboard", "Long Context Leaderboard", "Benchmark Viewer"]
+    )
+    with zero_tab:
+        from pages import zero_context
+        zero_context.show()
+    with long_tab:
+        from pages import long_context
+        long_context.show()
+    with viewer_tab:
+        # The benchmark viewer module may be absent; a failed import must not
+        # take the two leaderboard tabs down with it, so report it in place.
+        try:
+            from pages import benchmark_viewer
+        except ImportError as exc:
+            st.info(f"Benchmark viewer is unavailable: {exc}")
+        else:
+            benchmark_viewer.show()
+if __name__ == "__main__":
+    main()

data/long_context.csv ADDED Viewed

	@@ -0,0 +1,11 @@

+Model,8K,16K,32K,Average↑
+gemini-1.5-pro-002,1182.43,896.31,812.96,963.9
+qwen-2.5-72b-instruct,927.33,681.53,563.65,724.17
+mistral-large-2411,914.49,563.73,319.21,599.14
+deepseek-v3,935.10,477.02,313.66,575.2
+gemini-1.5-flash-002,673.88,476.72,377.38,509.3
+llama-3.1-70b-instruct,479.00,394.50,355.5,409.67
+minimax-text-01,481.32,359.56,325.95,388.94
+gpt-4o-mini,401.00,337.81,275.63,338.15
+qwen-2.5-7b-instruct,248.00,211.50,196.17,218.56
+llama-3.1-8b-instruct,183.67,149.50,109.45,147.54

data/zero_context.csv ADDED Viewed

	@@ -0,0 +1,19 @@

+Model,Row Color,Symbolic,Medium,Hard,1st<50% op,1st<10% op,Avg. Acc op≤30,Average↑
+deepseek-r1,yellow,7280.0,9750.85,8573.8,100,130,0.9427,8534.88
+o1-mini,yellow,5060.0,6054.91,3738.43,50,90,0.8397,4951.11
+deepseek-v3,None,4310.0,4100.81,2407.86,24,55,0.6669,3606.22
+qwq-32b-preview,yellow,3530.0,3205.75,1846.19,21,50,0.5403,2860.65
+gemini-1.5-pro-002,None,2547.0,3659.59,2318.28,26,45,0.6924,2841.62
+claude-3.5-sonnet,None,2161.0,3281.8,2115.79,26,40,0.6758,2519.53
+mistral-large-2411,None,2332.5,2879.92,2310.49,24,50,0.6645,2507.64
+qwen-2.5-72b-instruct,None,2048.0,2496.81,2016.38,21,40,0.5433,2187.06
+gpt-4o-2024-11-20,None,2379.0,2457.37,1451.54,18,30,0.5064,2095.97
+gemini-1.5-flash-002,None,1970.0,1478.75,1274.25,13,30,0.4460,1574.33
+llama-3.1-70b-instruct,None,1769.0,1650.25,1205.25,15,30,0.4314,1541.50
+minimax-text-01,green,1618.5,1712.64,1178.51,14,30,0.4213,1503.22
+llama-3.1-405b-instruct,None,1557.0,1321.54,950.0,11,20,0.3409,1276.18
+gpt-4o-mini,None,1389.0,1406.5,913.89,12,22,0.3094,1236.46
+claude-3.5-haiku,None,897.0,1053.16,784.34,10,22,0.2910,911.50
+qwen-2.5-7b-instruct,None,786.95,886.75,618.5,7,19,0.2257,764.07
+llama-3.1-8b-instruct,None,462.0,786.5,606.5,6,17,0.2186,618.30
+jamba-1.5-large,blue,856.0,485.13,466.4,6,26,0.1828,602.51

pages/__init__.py ADDED Viewed

File without changes

pages/long_context.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import streamlit as st
+import pandas as pd
+from utils.style import style_long_context
+@st.cache_data
+def load_data():
+    """Load the long-context leaderboard table.
+
+    The CSV is located relative to this module (``../data/long_context.csv``)
+    rather than the current working directory, so the app also works when it
+    is launched from another directory.
+
+    Returns:
+        The CSV contents as a ``pandas.DataFrame``. The result is cached by
+        ``st.cache_data``.
+    """
+    from pathlib import Path
+    csv_path = Path(__file__).resolve().parent.parent / "data" / "long_context.csv"
+    return pd.read_csv(csv_path)
+def show():
+    """Draw the Long Context Leaderboard: title, styled table, and notes."""
+    st.title("Long Context Leaderboard")
+    styled_df = style_long_context(load_data())
+    # One two-decimal number column per context length, plus the average.
+    column_config = {"Model": st.column_config.TextColumn(width="large")}
+    for context_length in ("8K", "16K", "32K"):
+        column_config[context_length] = st.column_config.NumberColumn(format="%.2f")
+    column_config["Average↑"] = st.column_config.NumberColumn(
+        format="%.2f",
+        help="Average across all context lengths",
+    )
+    # st.dataframe provides built-in sorting when a column header is clicked.
+    st.dataframe(
+        styled_df,
+        use_container_width=True,
+        height=600,
+        hide_index=True,
+        column_config=column_config,
+    )
+    # The blank line before "Benchmark Details" ends the preceding bullet
+    # list; without it Markdown folds the heading into the last list item.
+    st.markdown("""
+    **Context Lengths**:
+    - 8K: 8,000 tokens
+    - 16K: 16,000 tokens
+    - 32K: 32,000 tokens
+
+    **Benchmark Details**:
+    - Evaluated on Symbolic, Medium, and Hard subtasks
+    - AUC scores aggregated across context lengths
+    - Larger context evaluations limited by compute constraints
+    - Scores normalized across task complexities
+    """)

pages/zero_context.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import streamlit as st
+import pandas as pd
+from utils.style import style_zero_context
+@st.cache_data
+def load_data():
+    """Load the zero-context leaderboard table without its "Row Color" column.
+
+    The CSV is located relative to this module (``../data/zero_context.csv``)
+    rather than the current working directory, so the app also works when it
+    is launched from another directory.
+
+    Returns:
+        The CSV contents as a ``pandas.DataFrame`` with the "Row Color"
+        column removed if it was present. The result is cached by
+        ``st.cache_data``.
+    """
+    from pathlib import Path
+    csv_path = Path(__file__).resolve().parent.parent / "data" / "zero_context.csv"
+    # errors="ignore" makes the drop a no-op when the column is missing.
+    return pd.read_csv(csv_path).drop(columns=["Row Color"], errors="ignore")
+def show():
+    """Draw the Zero Context Leaderboard: title, styled table, and notes."""
+    st.title("Zero Context Leaderboard")
+    leaderboard = style_zero_context(load_data())
+    number_column = st.column_config.NumberColumn
+    # Per-column display settings; sorting is left to st.dataframe itself.
+    column_settings = {
+        "Model": st.column_config.TextColumn(width="large"),
+        "Symbolic": number_column(format="%.2f"),
+        "Medium": number_column(format="%.2f"),
+        "Hard": number_column(format="%.2f"),
+        "1st<50% op": number_column(format="%.0f"),
+        "1st<10% op": number_column(format="%.0f"),
+        "Avg. Acc op≤30": number_column(format="%.4f"),
+        "Average↑": number_column(format="%.2f", help="Average across all subsets"),
+    }
+    st.dataframe(
+        leaderboard,
+        column_config=column_settings,
+        height=800,
+        hide_index=True,
+        use_container_width=True,
+    )
+    # Explanatory notes shown beneath the table.
+    st.markdown("""
+    **Evaluation Criteria:**
+    - **AUC Calculation:** ...
+    - **Threshold Ops:** ...
+    """)

utils/style.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import pandas as pd
+# CSS background declarations used to highlight leaderboard rows, keyed by
+# colour name.
+COLOR_MAP = dict(
+    yellow="background-color: #FFFFCC",  # reasoning models
+    green="background-color: #E3FBE9",  # linear attention hybrid
+    blue="background-color: #E6F4FF",  # SSM hybrid models
+)
+def style_zero_context(df):
+    """Build the styled view of the zero-context leaderboard.
+
+    Rows whose "Model" value appears in the highlight mapping get that
+    background colour on every cell; all other rows are left unstyled.
+    The numeric columns listed below get display formats.
+
+    Args:
+        df: Leaderboard ``DataFrame``; must contain a "Model" column.
+
+    Returns:
+        A pandas ``Styler`` wrapping ``df``.
+    """
+    # Models that get a highlighted row; add further special cases here.
+    color_mapping = {
+        "minimax-text-01": COLOR_MAP["green"],
+        "jamba-1.5-large": COLOR_MAP["blue"],
+        "deepseek-r1": COLOR_MAP["yellow"],
+        "o1-mini": COLOR_MAP["yellow"],
+        "qwq-32b-preview": COLOR_MAP["yellow"],
+    }
+    styler = df.style.apply(
+        # One CSS string per cell: the row's colour, or "" for no styling.
+        lambda row: [color_mapping.get(row["Model"], "")] * len(row),
+        axis=1,
+    )
+    return styler.format({
+        "Symbolic": "{:,.2f}",       # thousands separator, 2 decimal places
+        "Medium": "{:,.2f}",         # thousands separator, 2 decimal places
+        "Hard": "{:,.2f}",           # thousands separator, 2 decimal places
+        "1st<50% op": "{:,.0f}",     # first operation number with accuracy <50%
+        "1st<10% op": "{:,.0f}",     # first operation number with accuracy <10%
+        "Avg. Acc op≤30": "{:.4f}",  # average accuracy of first 30 operations
+        "Average↑": "{:,.2f}",       # thousands separator, 2 decimal places
+    })
+# Add styling for model types
+def style_long_context(df):
+    """Build the styled view of the long-context leaderboard.
+
+    Rows for the models listed in the highlight mapping get a background
+    colour on every cell; the score columns are shown with a thousands
+    separator and two decimal places.
+
+    Args:
+        df: Leaderboard ``DataFrame``; must contain a "Model" column.
+
+    Returns:
+        A pandas ``Styler`` wrapping ``df``.
+    """
+    highlighted = {
+        "minimax-text-01": COLOR_MAP["green"],
+        "jamba-1.5-large": COLOR_MAP["blue"],
+    }
+
+    def paint_row(row):
+        # Same CSS string for every cell in the row ("" means unstyled).
+        css = highlighted.get(row["Model"], "")
+        return [css] * len(row)
+
+    score_formats = {column: "{:,.2f}" for column in ("8K", "16K", "32K", "Average↑")}
+    painted = df.style.apply(paint_row, axis=1)
+    return painted.format(score_formats)