atlas5301 committed on
Commit a1ac14e · 1 Parent(s): 93c1867

release full benchmark viewer

data/processed_results.csv ADDED
The diff for this file is too large to render. See raw diff
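The raw data diff is not rendered here, but the viewer added below loads this file with pd.read_csv and filters on a handful of columns. The snippet below is a minimal sketch of the expected layout, inferred only from the column names used in pages/benchmark_viewer.py; every value, model name, and label casing in it is a placeholder, not content from the actual processed_results.csv.

```python
# Placeholder rows illustrating the columns the viewer reads; the values and
# names are invented, not taken from the real processed_results.csv.
import pandas as pd

sample = pd.DataFrame({
    "dataset":  ["symbolic", "symbolic", "medium"],  # subtask name
    "model":    ["model-a", "model-a", "model-b"],   # evaluated model (placeholder)
    "length":   [1000, 1000, 4000],                  # context length
    "op":       [1, 2, 1],                           # operation count (x-axis in the viewer)
    "accuracy": [0.92, 0.75, 0.88],                  # accuracy in [0, 1]
})
print(sample)  # the viewer expects this layout at data/processed_results.csv
```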
 
pages/benchmark_viewer.py ADDED
@@ -0,0 +1,99 @@
+import streamlit as st
+import pandas as pd
+import plotly.graph_objects as go
+import numpy as np
+
+def show():
+    st.title("Benchmark Results Dashboard")
+
+    @st.cache_data
+    def load_data():
+        """Load and process benchmark results, handling zero accuracy."""
+        try:
+            df = pd.read_csv('data/processed_results.csv')
+        except FileNotFoundError:
+            st.error("File 'processed_results.csv' not found.")
+            st.stop()
+
+        epsilon = 1e-6
+        num_zero_acc = (df['accuracy'] <= 0).sum()
+        if num_zero_acc > 0:
+            st.warning(f"Found {num_zero_acc} zero/negative accuracy values. Replacing with {epsilon}.")
+            df.loc[df['accuracy'] <= 0, 'accuracy'] = epsilon
+
+        df['log_accuracy'] = np.log(df['accuracy'])
+        return df
+
+    df = load_data()
+
+    # Filters
+    st.header("Filters")
+
+    col1, col2, col3 = st.columns(3)
+
+    with col1:
+        datasets = df['dataset'].unique()
+        selected_datasets = st.multiselect("Dataset(s)", datasets, default=datasets)
+        # Filter data based on selected datasets first
+        filtered_df = df[df['dataset'].isin(selected_datasets)]
+
+        lengths = sorted(filtered_df['length'].unique())
+        # Disable length filter if no datasets are selected
+        disabled = not selected_datasets
+        selected_lengths = st.multiselect("Length(s)", lengths, default=lengths if not disabled and lengths else [], disabled=disabled)
+
+
+    with col2:
+        # Model multiselect (filtered by selected datasets)
+        available_models = filtered_df['model'].unique()
+        selected_models = st.multiselect("Model(s)", available_models, default=available_models)  # Handle empty defaults
+
+    with col3:
+        min_op, max_op = st.slider("Op Range", int(filtered_df['op'].min()), int(filtered_df['op'].max()), (int(filtered_df['op'].min()), int(filtered_df['op'].max())))
+        min_acc, max_acc = st.slider("Accuracy Range", float(filtered_df['accuracy'].min()), float(filtered_df['accuracy'].max()), (float(filtered_df['accuracy'].min()), float(filtered_df['accuracy'].max())))
+
+
+    filtered_df = filtered_df[filtered_df['model'].isin(selected_models) & filtered_df['length'].isin(selected_lengths)]
+
+
+    filtered_df = filtered_df[(filtered_df['op'] >= min_op) & (filtered_df['op'] <= max_op) & (filtered_df['accuracy'] >= min_acc) & (filtered_df['accuracy'] <= max_acc)]
+
+
+    if filtered_df.empty:
+        st.warning("No data for selected filters.")
+        st.stop()
+
+    def plot_data(filtered_df, selected_models, selected_lengths, log_scale=False):
+        """Plot accuracy (or log accuracy) vs. op, showing different datasets for the same model."""
+        fig = go.Figure()
+
+        for model in selected_models:
+            for length in selected_lengths:
+                for dataset in filtered_df['dataset'].unique():
+                    subset_df = filtered_df[(filtered_df['model'] == model) & (filtered_df['length'] == length) & (filtered_df['dataset'] == dataset)]
+                    if not subset_df.empty:
+                        y_data = subset_df['log_accuracy'] if log_scale else subset_df['accuracy']
+                        fig.add_trace(go.Scatter(
+                            x=subset_df['op'],
+                            y=y_data,
+                            mode='lines+markers',
+                            name=f'{model} Length {length} ({dataset})',
+                            marker=dict(size=6)
+                        ))
+
+        y_title = "Log(Accuracy)" if log_scale else "Accuracy"
+        fig.update_layout(title=f"{y_title} vs Op", xaxis_title="Op", yaxis_title=y_title)
+        return fig
+
+    view_option = st.radio("View", ["Accuracy", "Log(Accuracy)"])
+
+    if view_option == "Accuracy":
+        fig = plot_data(filtered_df, selected_models, selected_lengths, log_scale=False)
+    else:  # Log(Accuracy)
+        fig = plot_data(filtered_df, selected_models, selected_lengths, log_scale=True)
+
+    st.plotly_chart(fig, use_container_width=True)
+
+    if st.checkbox("Show Data Table"):
+        st.subheader("Filtered Data")
+        st.write(filtered_df)
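The page wraps all of its UI in a show() function instead of rendering at import time, which suggests an entry-point script selects a page and calls it. Below is a minimal sketch of such a dispatcher; the entry point itself is not part of this commit, so the sidebar labels and wiring are illustrative assumptions, with only the module names mirroring the files under pages/.

```python
# Hypothetical entry point (not included in this commit): pick a page in the
# sidebar and render it by calling the module's show() function.
import streamlit as st

from pages import benchmark_viewer, long_context, zero_context

PAGES = {
    "Benchmark Viewer": benchmark_viewer,
    "Long Context": long_context,
    "Zero Context": zero_context,
}

choice = st.sidebar.radio("Page", list(PAGES))
PAGES[choice].show()  # each page module defines show()
```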
pages/long_context.py CHANGED
@@ -41,6 +41,5 @@ def show():
     **Benchmark Details**:
     - Evaluated on Symbolic, Medium, and Hard subtasks
     - AUC scores aggregated across context lengths
-    - Larger context evaluations limited by compute constraints
-    - Scores normalized across task complexities
+    - Larger context evaluations limited by compute constraints and model performance
     """)
pages/zero_context.py CHANGED
@@ -40,7 +40,8 @@ def show():
 
     # You can leave your explanation/description below
    st.markdown("""
-    **Evaluation Criteria:**
-    - **AUC Calculation:** ...
-    - **Threshold Ops:** ...
+    **Benchmark Details**:
+    - Evaluated on Symbolic, Medium, and Hard subtasks.
+    - Area Under Curve (AUC) metrics are used to compare LLM performance.
+    - AUC is calculated using the np.trapz function.
     """)