import streamlit as st
import pandas as pd
import plotly.graph_objects as go
import numpy as np


def show():
    st.title("Benchmark Results Dashboard")

    @st.cache_data
    def load_data():
        """Load and process benchmark results, handling zero accuracy."""
        try:
            df = pd.read_csv('data/processed_results.csv')
        except FileNotFoundError:
            st.error("File 'data/processed_results.csv' not found.")
            st.stop()

        # Clamp zero (or negative) accuracies to a small epsilon so the
        # log transform below is defined.
        epsilon = 1e-6
        num_zero_acc = (df['accuracy'] <= 0).sum()
        if num_zero_acc > 0:
            df.loc[df['accuracy'] <= 0, 'accuracy'] = epsilon

        df['log_accuracy'] = np.log(df['accuracy'])
        return df

    df = load_data()

st.header("Filters") |
|
|
|
col1, col2, col3 = st.columns(3) |
|
|
|
with col1: |
|
datasets = df['dataset'].unique() |
|
selected_datasets = st.multiselect("Dataset(s)", datasets, default=['symbolic']) |
|
|
|
filtered_df = df[df['dataset'].isin(selected_datasets)] |
|
|
|
lengths = sorted(filtered_df['length'].unique()) |
|
|
|
disabled = not selected_datasets |
|
selected_lengths = st.multiselect("Length(s)", lengths, default=[0] if not disabled and 0 in lengths else [], disabled=disabled) |
|
|
|
|
|
with col2: |
|
|
|
available_models = filtered_df['model'].unique() |
|
selected_models = st.multiselect("Model(s)", available_models, default=['llama-3.1-8b-instruct','qwen-2.5-7b-instruct','llama-3.1-70b-instruct','qwen-2.5-72b-instruct','deepseek-v3']) |
|
|
|
with col3: |
|
min_op, max_op = st.slider("Op Range", int(filtered_df['op'].min()), int(filtered_df['op'].max()), (int(filtered_df['op'].min()), int(filtered_df['op'].max()))) |
|
min_acc, max_acc = st.slider("Accuracy Range", float(filtered_df['accuracy'].min()), float(filtered_df['accuracy'].max()), (float(filtered_df['accuracy'].min()), float(filtered_df['accuracy'].max()))) |
|
|
|
|
|
filtered_df = filtered_df[filtered_df['model'].isin(selected_models) & filtered_df['length'].isin(selected_lengths)] |
|
|
|
|
|
filtered_df = filtered_df[(filtered_df['op'] >= min_op) & (filtered_df['op'] <= max_op) & (filtered_df['accuracy'] >= min_acc) & (filtered_df['accuracy'] <= max_acc)] |
|
|
|
|
|
if filtered_df.empty: |
|
st.warning("No data for selected filters.") |
|
st.stop() |
|
|
|
    def plot_data(filtered_df, selected_models, selected_lengths, log_scale=False):
        """Plot accuracy vs op, showing different datasets for the same model."""
        fig = go.Figure()

        # One trace per (model, length, dataset) combination.
        for model in selected_models:
            for length in selected_lengths:
                for dataset in filtered_df['dataset'].unique():
                    subset_df = filtered_df[
                        (filtered_df['model'] == model)
                        & (filtered_df['length'] == length)
                        & (filtered_df['dataset'] == dataset)
                    ]
                    if not subset_df.empty:
                        y_data = subset_df['log_accuracy'] if log_scale else subset_df['accuracy']
                        fig.add_trace(go.Scatter(
                            x=subset_df['op'],
                            y=y_data,
                            mode='lines+markers',
                            name=f'{model} Length {length} ({dataset})',
                            marker=dict(size=6),
                        ))

        y_title = "Log(Accuracy)" if log_scale else "Accuracy"
        fig.update_layout(title=f"{y_title} vs Op", xaxis_title="Op", yaxis_title=y_title, width=800, height=600)
        return fig

    view_option = st.radio("View", ["Accuracy", "Log(Accuracy)"])

    fig = plot_data(
        filtered_df,
        selected_models,
        selected_lengths,
        log_scale=(view_option == "Log(Accuracy)"),
    )

    st.plotly_chart(fig, use_container_width=True)

if st.checkbox("Show Data Table"): |
|
st.subheader("Filtered Data") |
|
st.write(filtered_df) |
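

# Usage sketch (assumption, not part of the original file): this module looks
# like a page in a multipage Streamlit app, imported by an entry point and
# rendered via show(). The module and file names below are hypothetical.
#
#     # app.py
#     import dashboard_page  # hypothetical name for this module
#
#     dashboard_page.show()
#
# and the app would be launched with `streamlit run app.py`.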