PeterKruger committed on
Commit 61cd432 · 1 Parent(s): 364e95c

Add application code, data, requirements, and ignore venv

.gitignore ADDED
@@ -0,0 +1,16 @@
+ # Virtual Environment
+ venv/
+
+ # Python cache
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+
+ # Environment variables
+ .env
+
+ # IDE / Editor specific files (Optional examples)
+ .vscode/
+ .idea/
+ *.swp
app.py ADDED
@@ -0,0 +1,311 @@
+ import gradio as gr
+ import pandas as pd
+ import plotly.express as px
+ import os  # To check if files exist
+
+ # --- Configuration ---
+ DATA_DIR = "."  # Repo root; the CSV files live in the data/ subdirectory
+ SUMMARY_FILE = os.path.join(DATA_DIR, "data/summary_data.csv")
+ DOMAIN_RANKS_FILE = os.path.join(DATA_DIR, "data/domain_ranks.csv")
+ COST_FILE = os.path.join(DATA_DIR, "data/cost_data.csv")
+ AVG_LATENCY_FILE = os.path.join(DATA_DIR, "data/avg_latency.csv")
+ P99_LATENCY_FILE = os.path.join(DATA_DIR, "data/p99_latency.csv")
+
+ # --- Helper Function to Load Data ---
+ def load_data(filepath, separator=','):
+     """Loads a CSV file, returning an empty DataFrame if it is missing or unreadable."""
+     if not os.path.exists(filepath):
+         print(f"Warning: Data file not found at {filepath}")
+         return pd.DataFrame()  # Return empty DataFrame
+     try:
+         # Adjust separator if needed (e.g., sep='\t' for tab-separated)
+         df = pd.read_csv(filepath, sep=separator)
+         # Basic cleanup: drop unnamed index columns often added by spreadsheets
+         df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
+         # Attempt to convert numeric columns, coercing errors to NaN
+         for col in df.columns:
+             if col not in ('Model Name', 'model_name'):  # Avoid converting model names
+                 # Only convert columns that look numeric
+                 if df[col].astype(str).str.contains(r'^[0-9.,eE-]+$').any():
+                     df[col] = pd.to_numeric(df[col], errors='coerce')
+         return df
+     except Exception as e:
+         print(f"Error loading {filepath}: {e}")
+         return pd.DataFrame()
+
+ # --- Load All Data ---
+ print("Loading data...")
+ df_summary = load_data(SUMMARY_FILE)
+ df_domain = load_data(DOMAIN_RANKS_FILE)
+ df_cost = load_data(COST_FILE)
+ df_avg_latency = load_data(AVG_LATENCY_FILE)
+ df_p99_latency = load_data(P99_LATENCY_FILE)
+ print("Data loading complete.")
+
+ # --- Convert Costs to Cents ---
+ COST_COLUMN_SUMMARY = 'Costs (USD)'  # Must match the summary_data.csv header exactly
+ NEW_COST_COLUMN_SUMMARY = 'Avg Cost (Cents)'  # New name used throughout the app
+
+ # Convert summary cost
+ if not df_summary.empty and COST_COLUMN_SUMMARY in df_summary.columns:
+     df_summary[COST_COLUMN_SUMMARY] = pd.to_numeric(df_summary[COST_COLUMN_SUMMARY], errors='coerce') * 100
+     df_summary.rename(columns={COST_COLUMN_SUMMARY: NEW_COST_COLUMN_SUMMARY}, inplace=True)
+     print(f"Converted '{COST_COLUMN_SUMMARY}' to Cents and renamed to '{NEW_COST_COLUMN_SUMMARY}' in df_summary.")
+ else:
+     print(f"Warning: Column '{COST_COLUMN_SUMMARY}' not found in df_summary for conversion.")
+
+ # Convert cost breakdown data
+ if not df_cost.empty:
+     # The model name column in cost_data.csv is 'model_name'; adjust if your header differs
+     model_col_name = 'model_name'
+     cost_cols = [col for col in df_cost.columns if col != model_col_name]
+     for col in cost_cols:
+         # Coerce non-numeric data to NaN before multiplying
+         df_cost[col] = pd.to_numeric(df_cost[col], errors='coerce') * 100
+     print("Converted cost breakdown columns to Cents in df_cost.")
+ # --- End of Cost Conversion ---
+
+ # Rename columns for clarity; the original names must match the CSV headers exactly
+ try:
+     df_summary = df_summary.rename(columns={
+         'Model Name': 'Model',  # If your CSV uses 'Model Name'
+         # Add other renames here if your CSV headers differ from the target names below
+         # 'Costs (USD)': 'Avg Cost (USD/response)',
+         # 'Avg Answer Duration (sec)': 'Avg Latency (s)',
+         # 'P99 Answer Duration (sec)': 'P99 Latency (s)'
+     })
+     # Select and reorder columns for the main table
+     summary_cols_display = ['Model', 'AB', 'CBA', 'AAII', 'MMLU', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)', 'P99 Answer Duration (sec)']
+     # Keep only the columns that actually exist after loading and renaming
+     summary_cols_display = [col for col in summary_cols_display if col in df_summary.columns]
+     df_summary_display = df_summary[summary_cols_display].copy()  # .copy() avoids SettingWithCopyWarning below
+
+     # Ensure AB score is numeric for sorting
+     if 'AB' in df_summary_display.columns:
+         df_summary_display['AB'] = pd.to_numeric(df_summary_display['AB'], errors='coerce')
+         df_summary_display = df_summary_display.sort_values(by='AB', ascending=False)
+     else:
+         print("Warning: 'AB' column not found for sorting summary table.")
+
+ except KeyError as e:
+     print(f"Error preparing summary display columns: missing key {e}. Check CSV headers and the rename mapping.")
+     df_summary_display = df_summary  # Fall back to the raw loaded data
+
+
+ # --- Build Gradio App ---
+ with gr.Blocks(theme=gr.themes.Soft()) as app:
+     gr.Markdown("# AutoBench LLM Leaderboard")
+     gr.Markdown(
+         "Interactive leaderboard for AutoBench, where LLMs rank LLMs' responses. "
+         "Includes performance, cost, and latency metrics.\n"
+         "More info: [AutoBench Blog Post](https://huggingface.co/blog/PeterKruger/autobench)"
+     )
+
+     # --- Tab 1: Overall Ranking ---
+     with gr.Tab("Overall Ranking"):
+         gr.Markdown("## Overall Model Performance")
+         gr.Markdown("Models ranked by AutoBench score. Correlations: AB vs CBA: 83.74%, AB vs AAII: 72.49%. Lower cost (cents) and latency are better.")
+         # Only render the table if the summary data loaded
+         if not df_summary_display.empty:
+             gr.DataFrame(
+                 df_summary_display,
+                 datatype=['str'] + ['number'] * (len(df_summary_display.columns) - 1),  # First column is text, the rest numbers
+                 interactive=True,  # Allows sorting
+                 # height=600  # Adjust height as needed
+             )
+         else:
+             gr.Markdown("_(Summary data failed to load or is empty. Please check `summary_data.csv`)_")
+
+     # --- Tab 2: Performance Plots ---
+     with gr.Tab("Performance Plots"):
+         gr.Markdown("## Performance Visualizations")
+         gr.Markdown("Exploring relationships between AutoBench Rank, Latency, and Cost.")
+
+         # Plot 1: Rank vs Average Cost (using summary data)
+         gr.Markdown("### Rank vs. Average Cost")
+         if not df_summary.empty and 'AB' in df_summary.columns and NEW_COST_COLUMN_SUMMARY in df_summary.columns:
+             # Drop rows where essential plot data is missing
+             plot_df = df_summary.dropna(subset=['AB', NEW_COST_COLUMN_SUMMARY, 'Model']).copy()
+             plot_df[NEW_COST_COLUMN_SUMMARY] = pd.to_numeric(plot_df[NEW_COST_COLUMN_SUMMARY], errors='coerce')
+             plot_df = plot_df.dropna(subset=[NEW_COST_COLUMN_SUMMARY])  # Drop rows where cost conversion failed
+
+             if not plot_df.empty:
+                 fig_cost = px.scatter(
+                     plot_df,
+                     x=NEW_COST_COLUMN_SUMMARY,
+                     y="AB",
+                     text="Model",  # Show model name near each point
+                     log_x=True,  # Use log scale for cost
+                     title="AutoBench Rank vs. Average Cost per Response (USD Cents - Log Scale)",
+                     labels={'AB': 'AutoBench Rank', NEW_COST_COLUMN_SUMMARY: 'Avg Cost (USD Cents) - Log Scale'},
+                     hover_data=['Model', 'AB', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)']  # Details on hover
+                 )
+                 fig_cost.update_traces(textposition='top center')
+                 fig_cost.update_layout(
+                     xaxis_title="Avg Cost ($ Cents) - Log Scale",  # Bottom axis title
+                     yaxis_title="AutoBench Rank",
+                     width=1000,
+                     height=800,
+                     # Mirror tick labels on a top x-axis.
+                     # Note: Plotly may not render a secondary axis until a trace is assigned to it.
+                     xaxis2=dict(
+                         overlaying='x',       # Link to primary x-axis
+                         matches='x',          # Match primary x-axis properties (like type='log')
+                         side='top',           # Position on top
+                         showticklabels=True,  # Show the tick labels
+                         showline=True,        # Show the axis line itself
+                         title=None            # No title for the top axis
+                     )
+                 )
+                 gr.Plot(fig_cost)
+             else:
+                 gr.Markdown(f"_(Insufficient valid data for Rank vs Cost plot. Check 'AB' and '{NEW_COST_COLUMN_SUMMARY}' columns in `summary_data.csv`)_")
+         else:
+             gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs Cost plot)_")
+
+         # Plot 2: Rank vs Average Latency
+         gr.Markdown("### Rank vs. Average Latency")
+         if not df_summary.empty and 'AB' in df_summary.columns and 'Avg Answer Duration (sec)' in df_summary.columns:
+             # Drop rows where essential plot data is missing
+             plot_df_avg_latency = df_summary.dropna(subset=['AB', 'Avg Answer Duration (sec)', 'Model']).copy()
+             plot_df_avg_latency['Avg Answer Duration (sec)'] = pd.to_numeric(plot_df_avg_latency['Avg Answer Duration (sec)'], errors='coerce')
+             plot_df_avg_latency = plot_df_avg_latency.dropna(subset=['Avg Answer Duration (sec)'])  # Drop rows where conversion failed
+
+             if not plot_df_avg_latency.empty:
+                 fig_avg_latency = px.scatter(
+                     plot_df_avg_latency,
+                     x="Avg Answer Duration (sec)",
+                     y="AB",
+                     text="Model",
+                     log_x=True,  # Log scale for latency; adjust if not desired
+                     title="AutoBench Rank vs. Average Latency (Log Scale)",
+                     labels={'AB': 'AutoBench Rank', 'Avg Answer Duration (sec)': 'Avg Latency (s) - Log Scale'},
+                     hover_data=['Model', 'AB', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
+                 )
+                 fig_avg_latency.update_traces(textposition='top center')
+                 fig_avg_latency.update_layout(xaxis_title="Avg Latency (s) - Log Scale", yaxis_title="AutoBench Rank", width=1000, height=800)
+                 gr.Plot(fig_avg_latency)
+             else:
+                 gr.Markdown("_(Insufficient valid data for Rank vs Avg Latency plot. Check 'AB' and 'Avg Answer Duration (sec)' columns in `summary_data.csv`)_")
+         else:
+             gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs Avg Latency plot)_")
+
+
+         # Plot 3: Rank vs P99 Latency
+         gr.Markdown("### Rank vs. P99 Latency")
+         if not df_summary.empty and 'AB' in df_summary.columns and 'P99 Answer Duration (sec)' in df_summary.columns:
+             # Drop rows where essential plot data is missing
+             plot_df_p99_latency = df_summary.dropna(subset=['AB', 'P99 Answer Duration (sec)', 'Model']).copy()
+             plot_df_p99_latency['P99 Answer Duration (sec)'] = pd.to_numeric(plot_df_p99_latency['P99 Answer Duration (sec)'], errors='coerce')
+             plot_df_p99_latency = plot_df_p99_latency.dropna(subset=['P99 Answer Duration (sec)'])  # Drop rows where conversion failed
+
+             if not plot_df_p99_latency.empty:
+                 fig_p99_latency = px.scatter(
+                     plot_df_p99_latency,
+                     x="P99 Answer Duration (sec)",
+                     y="AB",
+                     text="Model",
+                     log_x=True,  # Log scale for latency; adjust if not desired
+                     title="AutoBench Rank vs. P99 Latency (Log Scale)",
+                     labels={'AB': 'AutoBench Rank', 'P99 Answer Duration (sec)': 'P99 Latency (s) - Log Scale'},
+                     hover_data=['Model', 'AB', 'P99 Answer Duration (sec)', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
+                 )
+                 fig_p99_latency.update_traces(textposition='top center')
+                 fig_p99_latency.update_layout(xaxis_title="P99 Latency (s) - Log Scale", yaxis_title="AutoBench Rank", width=1000, height=800)
+                 gr.Plot(fig_p99_latency)
+             else:
+                 gr.Markdown("_(Insufficient valid data for Rank vs P99 Latency plot. Check 'AB' and 'P99 Answer Duration (sec)' columns in `summary_data.csv`)_")
+         else:
+             gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs P99 Latency plot)_")
+
+     # --- Tab 3: Cost & Latency Analysis ---
+     with gr.Tab("Cost & Latency Analysis"):
+         gr.Markdown("## Performance vs. Cost/Latency Trade-offs")
+
+         # Cost Breakdown Table
+         gr.Markdown("### Cost Breakdown per Domain (USD Cents/Response)")
+         if not df_cost.empty:
+             # Make model name the first column if it exists
+             if 'model_name' in df_cost.columns:
+                 cols = ['model_name'] + [col for col in df_cost.columns if col != 'model_name']
+                 df_cost_display = df_cost[cols]
+             else:
+                 df_cost_display = df_cost  # Use as is if 'model_name' isn't found
+             gr.DataFrame(df_cost_display, interactive=True)
+         else:
+             gr.Markdown("_(Cost breakdown data failed to load or is empty. Please check `cost_data.csv`)_")
+
+         # Latency Breakdown Tables
+         gr.Markdown("### Average Latency Breakdown per Domain (Seconds)")
+         if not df_avg_latency.empty:
+             if 'model_name' in df_avg_latency.columns:
+                 cols = ['model_name'] + [col for col in df_avg_latency.columns if col != 'model_name']
+                 df_avg_latency_display = df_avg_latency[cols]
+             else:
+                 df_avg_latency_display = df_avg_latency
+             gr.DataFrame(df_avg_latency_display, interactive=True)
+         else:
+             gr.Markdown("_(Average latency data failed to load or is empty. Please check `avg_latency.csv`)_")
+
+         gr.Markdown("### P99 Latency Breakdown per Domain (Seconds)")
+         if not df_p99_latency.empty:
+             if 'model_name' in df_p99_latency.columns:
+                 cols = ['model_name'] + [col for col in df_p99_latency.columns if col != 'model_name']
+                 df_p99_latency_display = df_p99_latency[cols]
+             else:
+                 df_p99_latency_display = df_p99_latency
+             gr.DataFrame(df_p99_latency_display, interactive=True)
+         else:
+             gr.Markdown("_(P99 latency data failed to load or is empty. Please check `p99_latency.csv`)_")
+
+
+     # --- Tab 4: Domain Performance ---
+     with gr.Tab("Domain Performance"):
+         gr.Markdown("## Performance Across Different Domains")
+         gr.Markdown("Model ranks within specific knowledge or task areas. Higher is better.")
+         if not df_domain.empty:
+             if 'Model Name' in df_domain.columns:
+                 # Make Model Name the first column
+                 cols = ['Model Name'] + [col for col in df_domain.columns if col != 'Model Name']
+                 df_domain_display = df_domain[cols]
+             else:
+                 df_domain_display = df_domain  # Use as is
+             gr.DataFrame(df_domain_display, interactive=True)
+         else:
+             gr.Markdown("_(Domain ranks data failed to load or is empty. Please check `domain_ranks.csv`)_")
+
+     # --- Tab 5: About ---
+     with gr.Tab("About AutoBench"):
+         gr.Markdown("""
+         ## About AutoBench
+
+         AutoBench is an LLM benchmark where Large Language Models (LLMs) evaluate and rank the responses generated by other LLMs. The questions themselves are also generated by LLMs across a diverse set of domains and ranked for quality.
+
+         ### Methodology
+         1. **Question Generation:** High-quality questions across various domains (Coding, History, Science, etc.) are generated by capable LLMs.
+         2. **Response Generation:** The models being benchmarked generate answers to these questions.
+         3. **Ranking:** A high-capability LLM (e.g., GPT-4, Claude 3) ranks the responses from different models for each question, typically on a scale (e.g., 1-5).
+         4. **Aggregation:** Scores are averaged across multiple questions and domains to produce the final AutoBench rank.
+
+         ### Metrics
+         * **AutoBench Score (AB):** The average rank received by a model's responses across all questions/domains (higher is better).
+         * **Avg Cost (USD Cents/response):** Estimated average cost to generate one response based on model provider pricing (input + output tokens). Lower is better.
+         * **Avg Latency (s):** Average time taken by the model to generate a response. Lower is better.
+         * **P99 Latency (s):** The 99th percentile of response time, indicating worst-case latency. Lower is better.
+         * **CBA / AAII / MMLU:** Scores from other well-known benchmarks for comparison (where available).
+
+         ### Data
+         This leaderboard reflects a run completed on April 23, 2025. The run included recently released models such as o4-mini, GPT-4.1-mini, Gemini 2.5 Pro Preview, and Claude 3.7 Sonnet:thinking.
+
+         ### Links
+         * [AutoBench Blog Post](https://huggingface.co/blog/PeterKruger/autobench)
+         * [Leaderboard Source Code](https://huggingface.co/spaces/<your-username>/<your-space-name>/tree/main)
+
+         **Disclaimer:** Benchmark results provide one perspective on model capabilities. Performance can vary based on specific tasks, prompts, and API conditions. Costs are estimates and subject to change by providers. Latency depends on server load and geographic location.
+         """)
+
+ # --- Launch the App ---
+ print("Launching Gradio app...")
+ app.launch()
+ print("Gradio app launched.")
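The USD-to-cents conversion above assumes `Costs (USD)` holds dollars per response, so multiplying by 100 yields US cents. A minimal sketch for sanity-checking the converted values outside the app (assumes the `data/` layout committed below; the column picks are illustrative):

```python
import pandas as pd

# Load the raw summary data; costs are committed in USD per response.
df = pd.read_csv("data/summary_data.csv")

# Mirror app.py's conversion: USD -> US cents.
df["Avg Cost (Cents)"] = pd.to_numeric(df["Costs (USD)"], errors="coerce") * 100

# Spot-check: claude-3.5-haiku-20241022 at 0.00182703 USD should show ~0.183 cents.
print(df[["Model", "Avg Cost (Cents)"]].head())
```

Cents are presumably used for display because most per-response costs sit well below one dollar, which keeps the leaderboard numbers readable.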
data/avg_latency.csv ADDED
@@ -0,0 +1,26 @@
+ model_name,coding,creative writing,current news,general culture,grammar,history,logics,math,science,technology,Average (All Topics)
+ claude-3.5-haiku-20241022,15.48,11.25,10.95,10.37,9.97,10.82,8.59,11.27,9.6,9.76,10.8
+ claude-3.7-sonnet,23.57,16.24,14.73,16.55,13.08,17.49,10.55,14.46,13.77,14.83,15.53
+ claude-3.7-sonnet:thinking,71.38,35.11,30.36,34.95,34.29,39,58.43,85.34,34.01,35.17,45.8
+ deepSeek-R1,132.9,72.63,45.19,49.89,63.33,45.66,136.12,205.27,49.02,47.69,84.77
+ deepSeek-V3,71.53,28.24,28.23,32.61,27.73,26.69,32.57,44.95,23.61,29.47,34.57
+ deepSeek-V3-0324,60.1,29.61,38.87,31.23,31.3,30.26,49.36,70.08,34.02,47.99,42.28
+ gemini-2.0-flash-001,10.77,3.26,4.94,4.9,5.14,4.94,5.33,7.55,5.5,5.24,5.76
+ gemini-2.5-pro-preview-03-25,51.62,23.1,25.99,29.23,32.35,29.55,49.82,68.76,27.3,27.97,36.57
+ gemma-3-27b-it,57.3,18.12,26.05,21.7,24.51,25.17,23.42,40.69,34.57,28.76,30.03
+ gpt-4.1-mini,24.38,9.05,11.06,11.79,14.19,12.07,17.77,30.85,11.08,11.55,15.38
+ gpt-4o-mini,16.86,11.6,10.77,11.06,10.29,10.93,11.29,18.05,10.2,10.68,12.17
+ grok-2-1212,16.88,8.21,9.83,10.24,9.54,10.44,12.2,20.29,9.47,10.32,11.74
+ grok-3-beta,44.1,28.57,28.82,30.47,35.2,30.32,37.7,42.02,26.85,35.39,33.94
+ llama-3.1-Nemotron-70B-Instruct-HF,35.44,17.3,21.43,23.43,23.41,23.64,24.97,37.67,21.89,21.21,25.04
+ llama-3.3-70B-Instruct,42.84,19.57,26.71,33.2,26.5,27.23,31.8,42.4,32.56,27.52,31.03
+ llama-3_1-Nemotron-Ultra-253B-v1,70.17,23.7,23.43,24.11,31.86,21.37,80.5,116.39,24.5,22.37,43.84
+ llama-4-Maverick-17B-128E-Instruct-FP8,19.09,5.48,7.29,8.48,7.92,7.71,11.39,15.29,7.93,6.97,9.76
+ llama-4-Scout-17B-16E-Instruct,15.31,5.74,7.21,7.8,6.92,7.76,8.66,11.21,7.25,7.04,8.49
+ mistral-large-2411,52.36,24.34,26.9,29.83,22.78,28.73,25.14,33.72,19.13,28.87,29.18
+ mistral-small-24b-instruct-2501,20.56,13.11,13.6,11.5,11.19,10.28,11.95,22.39,12.28,13.1,13.99
+ nova-lite-v1,6.84,4.6,5.93,4.74,4.62,4.55,4.67,6.3,4.74,5.24,5.22
+ nova-pro-v1,9.29,6.08,4.63,5.19,5.01,4.59,5.11,7.41,4.64,4.54,5.65
+ o3-mini-2025-01-31,15.17,7.85,7.25,8.29,8.8,7.57,15.26,22.95,7.32,6.45,10.69
+ o4-mini-2025-04-16,25.56,14.83,14.82,15.96,19.49,15.95,21.58,34.85,14.75,13.26,19.1
+ qwen-plus,52.6,40.48,28.34,28.7,29.42,28.43,33.31,49.76,26.57,29.67,34.73
data/cost_data.csv ADDED
@@ -0,0 +1,26 @@
+ model_name,coding,creative writing,current news,general culture,grammar,history,logics,math,science,technology,Average (All Topics)
+ claude-3.5-haiku-20241022,0.0032948,0.00154037,0.0016057,0.0016259,0.00151862,0.00178182,0.00171982,0.00205066,0.0015184,0.00161418,0.00182703
+ claude-3.7-sonnet,0.02260275,0.00883012,0.00904476,0.01127775,0.00853292,0.01137565,0.0083595,0.01336142,0.0100265,0.00998204,0.01133934
+ claude-3.7-sonnet:thinking,0.079704,0.02546725,0.02257356,0.02740737,0.02584362,0.02936273,0.06541866,0.10232284,0.02794467,0.02593431,0.0431979
+ deepSeek-R1,0.0082304,0.0031812,0.0028937,0.00284753,0.00369708,0.00281948,0.00861731,0.01360111,0.00290555,0.00279671,0.00515901
+ deepSeek-V3,0.00143918,0.0008705,0.00080466,0.0008034,0.00072983,0.00073238,0.00111661,0.00146937,0.00069503,0.00076635,0.00094273
+ deepSeek-V3-0324,0.00155228,0.00063994,0.0008007,0.00070746,0.00073825,0.00071352,0.00151636,0.00165223,0.00068744,0.00120867,0.00102168
+ gemini-2.0-flash-001,0.00077989,0.00015062,0.00026746,0.00026793,0.00027084,0.00026226,0.00037649,0.00056978,0.00029988,0.00029987,0.0003545
+ gemini-2.5-pro-preview-03-25,0.02947544,0.00636458,0.00975805,0.01222104,0.01205163,0.01157245,0.0078375,0.0108456,0.01127743,0.01112851,0.01225322
+ gemma-3-27b-it,0.0004438,0.00015774,0.00020777,0.00020572,0.00019867,0.00019412,0.00023203,0.00041328,0.00023526,0.00022656,0.00025149
+ gpt-4.1-mini,0.00253364,0.00088755,0.00095741,0.000937,0.00120217,0.00097678,0.00193531,0.00309102,0.000936,0.001089,0.00145459
+ gpt-4o-mini,0.00059473,0.00037163,0.00030421,0.00032765,0.00028369,0.00031396,0.00040208,0.00063024,0.0002965,0.00034064,0.00038653
+ grok-2-1212,0.0130352,0.00550725,0.00660968,0.00663592,0.00646162,0.00686785,0.00951731,0.01720774,0.00614067,0.00673246,0.00847157
+ grok-3-beta,0.026382,0.01027425,0.01239936,0.01299412,0.01412204,0.01317081,0.02142797,0.02826706,0.012175,0.01828696,0.01694996
+ llama-3.1-Nemotron-70B-Instruct-HF,0.00054719,0.00027321,0.00032472,0.00034392,0.00034446,0.00035163,0.00042114,0.00061984,0.00031967,0.00031898,0.00038647
+ llama-3.3-70B-Instruct,0.00051491,0.00020951,0.00031271,0.00033135,0.00028526,0.00032196,0.00040135,0.00055238,0.00030856,0.00031853,0.00035565
+ llama-3_1-Nemotron-Ultra-253B-v1,0.00510581,0.00172625,0.00167261,0.0016562,0.0022302,0.00150722,0.00604944,0.00848181,0.0016631,0.00154232,0.0031635
+ llama-4-Maverick-17B-128E-Instruct-FP8,0.00107845,0.00042373,0.00052103,0.00054683,0.00054633,0.0005472,0.00092127,0.00109058,0.00052545,0.00051867,0.00067195
+ llama-4-Scout-17B-16E-Instruct,0.0008043,0.0003345,0.00041023,0.00042973,0.00037571,0.00041703,0.00054822,0.00068883,0.00037612,0.00038535,0.000477
+ mistral-large-2411,0.0083134,0.00458883,0.00470712,0.00462658,0.00415023,0.00458323,0.00468037,0.00716394,0.00469011,0.00497415,0.0052478
+ mistral-small-24b-instruct-2501,0.00018066,0.00010744,0.00010954,0.00010174,0.0000935,0.00009791,0.00013218,0.00017749,0.00009814,0.00010746,0.00012061
+ nova-lite-v1,0.00024167,0.0001164,0.00011943,0.00013032,0.00012636,0.00013363,0.00017926,0.00028925,0.00012305,0.00012955,0.00015889
+ nova-pro-v1,0.002483,0.00148737,0.00105101,0.00114887,0.00108769,0.00103572,0.00139458,0.00209969,0.00096489,0.0010052,0.0013758
+ o3-mini-2025-01-31,0.00929378,0.00434688,0.00347006,0.00375008,0.0041121,0.00384886,0.0095952,0.01557568,0.00371201,0.00355482,0.00612595
+ o4-mini-2025-04-16,0.01199341,0.00602218,0.00628254,0.00633765,0.00797699,0.0060602,0.00986951,0.01304156,0.00612248,0.00559202,0.00792985
+ qwen-plus,0.00148482,0.00108252,0.00073573,0.00073057,0.00073734,0.00074188,0.00102462,0.0014788,0.0006776,0.00077935,0.00094732
data/domain_ranks.csv ADDED
@@ -0,0 +1,26 @@
+ Model Name,logics,coding,technology,history,science,general culture,creative writing,grammar,current news,math,General Average
+ claude-3.5-haiku-20241022,3.85,4,4.07,4.15,4.05,4.11,4.2,3.98,4.04,3.44,3.99
+ claude-3.7-sonnet,3.96,4.27,4.3,4.34,4.29,4.31,4.41,4.14,4.15,3.87,4.2
+ claude-3.7-sonnet:thinking,4.18,4.48,4.48,4.54,4.45,4.48,4.48,4.4,4.32,4.06,4.39
+ deepSeek-R1,3.97,4.05,4.39,4.39,4.35,4.35,4.46,4.32,4.29,3.95,4.26
+ deepSeek-V3,4.04,4.01,4.12,4.06,4.13,4.14,4.32,4.11,4.08,3.91,4.09
+ deepSeek-V3-0324,4.07,4.25,4.13,4.18,4.11,4.17,4.33,4.22,4.17,3.97,4.16
+ gemini-2.0-flash-001,3.97,4.18,4.29,4.3,4.25,4.28,3.99,4.24,4.18,3.85,4.16
+ gemini-2.5-pro-preview-03-25,4.17,4.5,4.59,4.6,4.56,4.59,4.42,4.53,4.48,4.17,4.46
+ gemma-3-27b-it,3.9,3.98,4.34,4.38,4.33,4.36,4.35,4.33,4.29,3.7,4.2
+ gpt-4.1-mini,4.3,4.42,4.4,4.32,4.3,4.3,4.41,4.44,4.22,4.34,4.34
+ gpt-4o-mini,3.82,3.97,4.07,4.03,4.07,4.1,4.2,3.97,4,3.79,4
+ grok-2-1212,3.92,4.12,4.14,4.16,4.19,4.17,4.17,4.16,4.08,3.87,4.1
+ grok-3-beta,4.05,4.33,4.36,4.45,4.42,4.43,4.47,4.54,4.36,4.07,4.34
+ llama-3.1-Nemotron-70B-Instruct-HF,3.99,4.1,4.29,4.32,4.3,4.32,4.3,4.27,4.2,3.68,4.18
+ llama-3.3-70B-Instruct,3.93,3.83,4.21,4.13,4.15,4.17,4.02,4.1,4.07,3.52,4.02
+ llama-3_1-Nemotron-Ultra-253B-v1,4.06,4.17,4.36,4.34,4.31,4.33,4.38,4.36,4.33,3.91,4.26
+ llama-4-Maverick-17B-128E-Instruct-FP8,3.86,3.98,4.1,4.1,4.1,4.05,4.04,4.1,3.99,3.64,4
+ llama-4-Scout-17B-16E-Instruct,3.89,3.88,4.09,4.11,4.14,4.1,4.04,4.09,4.04,3.53,4
+ mistral-large-2411,3.87,3.98,4.18,4.09,4.17,4.08,4.19,4.05,4.07,3.88,4.05
+ mistral-small-24b-instruct-2501,3.66,3.86,4.08,4.02,4.09,4.05,3.42,3.94,4.01,3.59,3.88
+ nova-lite-v1,3.77,3.73,4.05,4.02,4.02,4.04,3.86,3.9,3.86,3.56,3.89
+ nova-pro-v1,3.74,3.81,3.86,3.82,3.86,3.9,4.06,3.91,3.78,3.56,3.83
+ o3-mini-2025-01-31,4.32,4.44,4.25,4.21,4.21,4.2,4.35,4.23,4.09,4.41,4.26
+ o4-mini-2025-04-16,4.48,4.55,4.61,4.61,4.67,4.59,4.51,4.6,4.57,4.57,4.57
+ qwen-plus,4.1,4.23,4.24,4.21,4.22,4.17,4.3,4.19,4.06,4.03,4.17
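The `General Average` column appears to be the plain arithmetic mean of the ten domain columns (for claude-3.5-haiku-20241022, the domain ranks average to ≈3.99, matching the committed value). A minimal sketch to verify that assumption against the committed file:

```python
import pandas as pd

df = pd.read_csv("data/domain_ranks.csv")

# Every column except the model name and the precomputed average is a domain rank.
domain_cols = [c for c in df.columns if c not in ("Model Name", "General Average")]

# Recompute the per-model mean and compare with the committed General Average.
diff = (df[domain_cols].mean(axis=1) - df["General Average"]).abs()
print(diff.max())  # close to zero (up to rounding) if the assumption holds
```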
data/p99_latency.csv ADDED
@@ -0,0 +1,26 @@
+ model_name,coding,creative writing,current news,general culture,grammar,history,logics,math,science,technology,Average (All Topics)
+ claude-3.5-haiku-20241022,23.7,19.75,19.06,20.65,15.82,17.31,15.7,21.4,13.3,13.16,17.98
+ claude-3.7-sonnet,34.5,42.15,35.2,24.45,22,41.91,32.66,45.31,21,29.41,32.86
+ claude-3.7-sonnet:thinking,122.38,86.02,45.39,56.93,59.81,58.5,137.09,145.15,53.35,61.38,82.6
+ deepSeek-R1,265.87,557.13,73,123.61,195.61,68.46,393.58,391.41,97.07,68.91,223.47
+ deepSeek-V3,489.39,52.4,48.32,63.66,57.52,64.71,72.51,89.72,35.64,91.46,106.53
+ deepSeek-V3-0324,132.59,60.61,92.63,98.62,64.37,68.91,202.2,230.86,136.28,318.33,140.54
+ gemini-2.0-flash-001,13.29,6.62,7.15,7.43,9.12,6.65,10.66,12.31,7.46,7.56,8.82
+ gemini-2.5-pro-preview-03-25,78.66,32.41,37.16,59.49,48.05,40.25,128.55,137.5,39.82,39.92,64.18
+ gemma-3-27b-it,205.53,47.1,49.13,39.73,59.89,77.24,62.05,98.21,80.09,72.23,79.12
+ gpt-4.1-mini,39.52,15.25,20.58,32.46,25.08,28.67,36.87,52.4,19.65,21.41,29.19
+ gpt-4o-mini,31.65,18.1,16.75,19.07,20.07,22.09,17.86,37.33,17.63,16.96,21.75
+ grok-2-1212,25.85,12.72,13.91,17.66,17.06,17.54,28.11,72.73,13.08,14.55,23.32
+ grok-3-beta,80.37,50.31,69.86,44.77,81.45,56.9,90.91,87.52,45.89,89.96,69.79
+ llama-3.1-Nemotron-70B-Instruct-HF,65.57,29.35,29.17,33.06,35.17,29.16,44.7,165.37,28.79,27.03,48.74
+ llama-3.3-70B-Instruct,83.59,53.04,79.05,70.62,70.25,57.85,69.79,117.32,77.75,57.72,73.7
+ llama-3_1-Nemotron-Ultra-253B-v1,157.34,41.92,48.86,54.35,55.23,43.9,205.29,236.74,51.17,49.75,94.45
+ llama-4-Maverick-17B-128E-Instruct-FP8,80.03,10.11,13.5,15.19,12.64,13.25,18.17,42.55,12.47,13.21,23.11
+ llama-4-Scout-17B-16E-Instruct,26.4,9.28,10.96,11.67,9.58,12,14.52,20.09,10.46,13.2,13.82
+ mistral-large-2411,157.72,54.36,82.25,77.68,69.55,160.87,136.52,98.81,29.76,100.24,96.77
+ mistral-small-24b-instruct-2501,36.17,26.9,21.63,20.29,16.93,19.97,32.45,75.08,21.31,25.44,29.62
+ nova-lite-v1,12.92,5.83,32.65,6.61,6.51,5.7,9.03,19.28,7.8,18.38,12.47
+ nova-pro-v1,15.72,11.25,6.86,10.76,7.67,8.17,9.15,14.68,6.84,8.2,9.93
+ o3-mini-2025-01-31,35.06,16.4,16.14,20.01,18.56,14.95,39.78,52.4,13.09,10.28,23.67
+ o4-mini-2025-04-16,57.74,39.5,24.57,39.62,48.97,33.24,70.85,164.19,25.11,19.22,52.3
+ qwen-plus,77.04,72.72,55.77,55.36,69.05,48.73,68.49,121.78,41.3,56.74,66.7
data/summary_data.csv ADDED
@@ -0,0 +1,26 @@
+ Model,AB,CBA,AAII,MMLU,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec)
+ claude-3.5-haiku-20241022,3.99,1237,34740,0.634,0.00182703,10.80,17.98
+ claude-3.7-sonnet,4.2,1293,48150,0.803,0.01133934,15.53,32.86
+ claude-3.7-sonnet:thinking,4.39,1303,57390,0.837,0.0431979,45.80,82.60
+ deepSeek-R1,4.26,1358,60220,0.844,0.00515901,84.77,223.47
+ deepSeek-V3,4.09,1318,45580,0.752,0.00094273,34.57,106.53
+ deepSeek-V3-0324,4.16,1372,53240,0.819,0.00102168,42.28,140.54
+ gemini-2.0-flash-001,4.16,1356,48090,0.779,0.0003545,5.76,8.82
+ gemini-2.5-pro-preview-03-25,4.46,1439,67840,0.858,0.01225322,36.57,64.18
+ gemma-3-27b-it,4.2,1342,37620,0.669,0.00025149,30.03,79.12
+ gpt-4.1-mini,4.34,,52860,0.781,0.00145459,15.38,29.19
+ gpt-4o-mini,4,1272,35680,0.648,0.00038653,12.17,21.75
+ grok-2-1212,4.1,1288,39230,0.709,0.00847157,11.74,23.32
+ grok-3-beta,4.34,1402,50630,0.799,0.01694996,33.94,69.79
+ llama-3.1-Nemotron-70B-Instruct-HF,4.18,1269,37280,,0.00038647,25.04,48.74
+ llama-3.3-70B-Instruct,4.02,1257,41110,0.713,0.00035565,31.03,73.70
+ llama-3_1-Nemotron-Ultra-253B-v1,4.26,,,0.69,0.0031635,43.84,94.45
+ llama-4-Maverick-17B-128E-Instruct-FP8,4,1271,50530,0.809,0.00067195,9.76,23.11
+ llama-4-Scout-17B-16E-Instruct,4,,42990,0.752,0.000477,8.49,13.82
+ mistral-large-2411,4.05,1249,38270,0.697,0.0052478,29.18,96.77
+ mistral-small-24b-instruct-2501,3.88,1217,35280,0.652,0.00012061,13.99,29.62
+ nova-lite-v1,3.89,1217,32530,0.59,0.00015889,5.22,12.47
+ nova-pro-v1,3.83,1245,37080,0.691,0.0013758,5.65,9.93
+ o3-mini-2025-01-31,4.26,1305,62860,0.791,0.00612595,10.69,23.67
+ o4-mini-2025-04-16,4.57,,69830,0.832,0.00792985,19.10,52.30
+ qwen-plus,4.17,1310,,,0.00094732,34.73,66.70
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ gradio
+ pandas
+ plotly