# GSM-Infinite-Leaderboard — pages/benchmark_viewer.py
# Author: atlas5301 — "update format and adjust name" (commit ccfe614)
import streamlit as st
import pandas as pd
import plotly.graph_objects as go
import numpy as np
def show():
    """Render the benchmark results dashboard page.

    Loads processed benchmark results from ``data/processed_results.csv``,
    exposes dataset / length / model / op-range / accuracy-range filters,
    and plots accuracy (or log-accuracy) versus operation count, one trace
    per (model, length, dataset) combination.
    """
    st.title("Benchmark Results Dashboard")

    @st.cache_data
    def load_data():
        """Load and process benchmark results, handling zero accuracy.

        Non-positive accuracy values would make log() undefined, so they
        are clamped to a small epsilon before log_accuracy is computed.
        """
        try:
            df = pd.read_csv('data/processed_results.csv')
        except FileNotFoundError:
            st.error("File 'processed_results.csv' not found.")
            st.stop()
        epsilon = 1e-6
        # log(0) is -inf; clamp zero/negative accuracies to epsilon first.
        df.loc[df['accuracy'] <= 0, 'accuracy'] = epsilon
        df['log_accuracy'] = np.log(df['accuracy'])
        return df

    df = load_data()

    # --- Filters ---
    st.header("Filters")
    col1, col2, col3 = st.columns(3)

    with col1:
        datasets = df['dataset'].unique()
        # Only offer defaults that actually exist in the data; a default
        # value absent from the options raises a StreamlitAPIException.
        dataset_defaults = [d for d in ['symbolic'] if d in datasets]
        selected_datasets = st.multiselect("Dataset(s)", datasets,
                                           default=dataset_defaults)
        filtered_df = df[df['dataset'].isin(selected_datasets)]
        lengths = sorted(filtered_df['length'].unique())
        # Disable the length filter when no datasets are selected.
        disabled = not selected_datasets
        length_defaults = [0] if (not disabled and 0 in lengths) else []
        selected_lengths = st.multiselect("Length(s)", lengths,
                                         default=length_defaults,
                                         disabled=disabled)

    # Guard before building the sliders: min()/max() of an empty frame are
    # NaN, and int(NaN)/float(NaN) would raise ValueError below.
    if filtered_df.empty:
        st.warning("No data for selected filters.")
        st.stop()

    with col2:
        # Model choices are restricted to models present in the selected
        # datasets; preferred defaults are intersected with availability.
        available_models = filtered_df['model'].unique()
        preferred_models = ['llama-3.1-8b-instruct', 'qwen-2.5-7b-instruct',
                            'llama-3.1-70b-instruct', 'qwen-2.5-72b-instruct',
                            'deepseek-v3']
        model_defaults = [m for m in preferred_models if m in available_models]
        selected_models = st.multiselect("Model(s)", available_models,
                                         default=model_defaults)

    with col3:
        op_lo, op_hi = int(filtered_df['op'].min()), int(filtered_df['op'].max())
        min_op, max_op = st.slider("Op Range", op_lo, op_hi, (op_lo, op_hi))
        acc_lo = float(filtered_df['accuracy'].min())
        acc_hi = float(filtered_df['accuracy'].max())
        min_acc, max_acc = st.slider("Accuracy Range", acc_lo, acc_hi,
                                     (acc_lo, acc_hi))

    # Apply every remaining filter in one boolean mask.
    filtered_df = filtered_df[
        filtered_df['model'].isin(selected_models)
        & filtered_df['length'].isin(selected_lengths)
        & (filtered_df['op'] >= min_op) & (filtered_df['op'] <= max_op)
        & (filtered_df['accuracy'] >= min_acc)
        & (filtered_df['accuracy'] <= max_acc)
    ]
    if filtered_df.empty:
        st.warning("No data for selected filters.")
        st.stop()

    def plot_data(filtered_df, selected_models, selected_lengths, log_scale=False):
        """Build a line plot of (log-)accuracy vs op.

        One trace is added per (model, length, dataset) combination that
        has rows in filtered_df; returns the Plotly figure.
        """
        fig = go.Figure()
        y_col = 'log_accuracy' if log_scale else 'accuracy'
        for model in selected_models:
            for length in selected_lengths:
                for dataset in filtered_df['dataset'].unique():
                    subset_df = filtered_df[
                        (filtered_df['model'] == model)
                        & (filtered_df['length'] == length)
                        & (filtered_df['dataset'] == dataset)
                    ]
                    if subset_df.empty:
                        continue
                    fig.add_trace(go.Scatter(
                        x=subset_df['op'],
                        y=subset_df[y_col],
                        mode='lines+markers',
                        name=f'{model} Length {length} ({dataset})',
                        marker=dict(size=6),
                    ))
        y_title = "Log(Accuracy)" if log_scale else "Accuracy"
        fig.update_layout(title=f"{y_title} vs Op", xaxis_title="Op",
                          yaxis_title=y_title, width=800, height=600)
        return fig

    view_option = st.radio("View", ["Accuracy", "Log(Accuracy)"])
    # Single call site; the radio choice only toggles the y-axis scale.
    fig = plot_data(filtered_df, selected_models, selected_lengths,
                    log_scale=(view_option == "Log(Accuracy)"))
    st.plotly_chart(fig, use_container_width=True)

    if st.checkbox("Show Data Table"):
        st.subheader("Filtered Data")
        st.write(filtered_df)