Add data
Browse files- app.py +130 -68
- inspect_log_file_names.json → data/inspect_log_file_names.json +0 -0
- data/populate_results.py +41 -0
- data/results.json +948 -0
- data/results.json.bak +760 -0
- data/tasks.json +142 -0
- src/about.py +2 -7
- src/display/formatting.py +2 -0
- src/submission/submit.py +119 -119
app.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
from apscheduler.schedulers.background import BackgroundScheduler
|
| 3 |
-
from huggingface_hub import snapshot_download
|
| 4 |
import pandas as pd
|
|
|
|
| 5 |
|
| 6 |
from src.about import (
|
| 7 |
REPRODUCIBILITY_TEXT,
|
|
@@ -10,64 +9,65 @@ from src.about import (
|
|
| 10 |
TITLE,
|
| 11 |
)
|
| 12 |
from src.display.css_html_js import custom_css, custom_js
|
| 13 |
-
from src.display.utils import (
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
)
|
| 21 |
-
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
| 22 |
-
from src.populate import get_evaluation_queue_df, get_leaderboard_df, TASK_NAME_INVERSE_MAP
|
| 23 |
-
from src.submission.submit import add_new_eval
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
|
|
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
|
| 72 |
# styler = dataframe.style.apply(bold_max, subset=pd.IndexSlice[:, dataframe.columns[1:]])
|
| 73 |
# df.style.set_table_styles([
|
|
@@ -85,13 +85,78 @@ def init_leaderboard(df, benchmark_type):
|
|
| 85 |
# styled_df = df.style.set_tooltips(tooltips)
|
| 86 |
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
return gr.components.Dataframe(
|
| 89 |
-
value=
|
| 90 |
-
datatype=[
|
| 91 |
-
column_widths=["
|
| 92 |
wrap=False,
|
| 93 |
)
|
| 94 |
|
|
|
|
| 95 |
black_logo_path = "src/assets/logo-icon-black.png"
|
| 96 |
white_logo_path = "src/assets/logo-icon-white.png"
|
| 97 |
|
|
@@ -123,10 +188,10 @@ with demo:
|
|
| 123 |
|
| 124 |
with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
|
| 125 |
with gr.TabItem("Base Benchmarks", elem_classes="llm-benchmark-tab-table", id=0):
|
| 126 |
-
|
| 127 |
|
| 128 |
with gr.TabItem("Agentic Benchmarks", elem_classes="llm-benchmark-tab-table", id=1):
|
| 129 |
-
|
| 130 |
|
| 131 |
with gr.TabItem("About", elem_classes="llm-benchmark-tab-table", id=2):
|
| 132 |
gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text", sanitize_html=False)
|
|
@@ -135,8 +200,5 @@ with demo:
|
|
| 135 |
gr.Markdown(REPRODUCIBILITY_TEXT, elem_classes="markdown-text", sanitize_html=False)
|
| 136 |
|
| 137 |
assets = [black_logo_path, white_logo_path]
|
|
|
|
| 138 |
|
| 139 |
-
scheduler = BackgroundScheduler()
|
| 140 |
-
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 141 |
-
scheduler.start()
|
| 142 |
-
demo.queue(default_concurrency_limit=40).launch(allowed_paths=assets)
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
+
import json
|
| 4 |
|
| 5 |
from src.about import (
|
| 6 |
REPRODUCIBILITY_TEXT,
|
|
|
|
| 9 |
TITLE,
|
| 10 |
)
|
| 11 |
from src.display.css_html_js import custom_css, custom_js
|
| 12 |
+
# from src.display.utils import (
|
| 13 |
+
# COLS,
|
| 14 |
+
# ST_BENCHMARK_COLS,
|
| 15 |
+
# AGENTIC_BENCHMARK_COLS,
|
| 16 |
+
# EVAL_COLS,
|
| 17 |
+
# AutoEvalColumn,
|
| 18 |
+
# fields,
|
| 19 |
+
# )
|
| 20 |
+
# from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
| 21 |
+
# from src.populate import get_evaluation_queue_df, get_leaderboard_df, TASK_NAME_INVERSE_MAP
|
| 22 |
+
# from src.submission.submit import add_new_eval
|
| 23 |
+
from src.display.formatting import make_clickable_field
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# def restart_space():
|
| 27 |
+
# API.restart_space(repo_id=REPO_ID)
|
| 28 |
+
|
| 29 |
+
# ### Space initialisation
|
| 30 |
+
# try:
|
| 31 |
+
# print(EVAL_REQUESTS_PATH)
|
| 32 |
+
# snapshot_download(
|
| 33 |
+
# repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 34 |
+
# )
|
| 35 |
+
# except Exception:
|
| 36 |
+
# restart_space()
|
| 37 |
+
# try:
|
| 38 |
+
# print(EVAL_RESULTS_PATH)
|
| 39 |
+
# snapshot_download(
|
| 40 |
+
# repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
|
| 41 |
+
# )
|
| 42 |
+
# except Exception:
|
| 43 |
+
# restart_space()
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, ST_BENCHMARK_COLS)
|
| 47 |
+
# AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, AGENTIC_BENCHMARK_COLS)
|
| 48 |
+
|
| 49 |
+
# (
|
| 50 |
+
# finished_eval_queue_df,
|
| 51 |
+
# running_eval_queue_df,
|
| 52 |
+
# pending_eval_queue_df,
|
| 53 |
+
# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
| 54 |
+
|
| 55 |
+
# def bold_max(s):
|
| 56 |
+
# is_max = s == s.max() # Boolean Series: True for the max value(s)
|
| 57 |
+
# return ['font-weight: bold' if v else '' for v in is_max]
|
| 58 |
+
|
| 59 |
+
# def init_leaderboard(df, benchmark_type):
|
| 60 |
+
# if df is None or df.empty:
|
| 61 |
+
# raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 62 |
|
| 63 |
+
# non_task_cols = ["Model"]
|
| 64 |
+
# if benchmark_type == "agentic":
|
| 65 |
+
# # Include agent column
|
| 66 |
+
# non_task_cols.append("Agent")
|
| 67 |
+
# elif benchmark_type == "base":
|
| 68 |
+
# # Drop agent column
|
| 69 |
+
# dataframe = dataframe.drop(columns=["Agent"])
|
| 70 |
+
# AutoEvalColumnSubset = [c for c in fields(AutoEvalColumn) if ((c.name in non_task_cols) or (TASK_NAME_INVERSE_MAP.get(c.name, dict()).get("type", "")==benchmark_type))]
|
| 71 |
|
| 72 |
# styler = dataframe.style.apply(bold_max, subset=pd.IndexSlice[:, dataframe.columns[1:]])
|
| 73 |
# df.style.set_table_styles([
|
|
|
|
| 85 |
# styled_df = df.style.set_tooltips(tooltips)
|
| 86 |
|
| 87 |
|
| 88 |
+
# return gr.components.Dataframe(
|
| 89 |
+
# value=df,
|
| 90 |
+
# datatype=[c.type for c in AutoEvalColumnSubset],
|
| 91 |
+
# column_widths=["150px" if c.name != "Model" else "250px" for c in AutoEvalColumnSubset],
|
| 92 |
+
# wrap=False,
|
| 93 |
+
# )
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def build_leaderboard(type):
    """Build the Gradio leaderboard table for one benchmark category.

    Loads per-model scores from ``data/results.json`` and task metadata from
    ``data/tasks.json``, keeps only tasks whose ``type`` field equals the
    requested category, and produces one row per model with one column per
    task (labelled with the task's ``display_name``).

    Scores are multiplied by 100 and rounded to two decimals. When a task
    entry carries a ``log_url`` the score is wrapped with
    ``make_clickable_field``; missing scores are shown as ``"--"``.

    Args:
        type: Benchmark category to display. Callers in this file pass
            ``"base"`` or ``"agentic"``. The name shadows the builtin but is
            kept so existing callers continue to work.

    Returns:
        A ``gr.components.Dataframe`` whose columns are all rendered as HTML.
        For ``"agentic"`` an ``Agent`` column is inserted after ``Model``.
    """
    # Local alias so the body does not keep using a name that shadows a builtin.
    benchmark_type = type

    with open('data/results.json', 'r', encoding='utf-8') as f:
        results = json.load(f)

    with open('data/tasks.json', 'r', encoding='utf-8') as f:
        tasks = json.load(f)

    # Only tasks of the requested category contribute columns.
    filtered_tasks = {
        name: info for name, info in tasks.items() if info['type'] == benchmark_type
    }

    def metric_value(metrics, task_info):
        """Return the score for a task, or None when there is none.

        Prefers the metric named in tasks.json so the displayed score and the
        "has agentic results" filter read the same field. Falls back to the
        first entry that is not the log URL, which avoids depending on the
        key order of results.json.
        """
        metric_key = task_info.get('metric')
        if metric_key in metrics:
            return metrics[metric_key]
        return next((v for k, v in metrics.items() if k != 'log_url'), None)

    rows = []
    for model_data in results.values():
        model_results = model_data['results']

        # For the agentic view, skip models with no agentic score at all.
        if benchmark_type == "agentic" and not any(
            metric_value(model_results.get(task, {}), info) is not None
            for task, info in filtered_tasks.items()
        ):
            continue

        config = model_data["config"]
        row = {
            'Model': make_clickable_field(config["model_name"], config["model_sha"])
        }

        for dataset, metrics in model_results.items():
            task_info = filtered_tasks.get(dataset)
            if task_info is None:
                # Task belongs to another category (or is unknown).
                continue

            value = metric_value(metrics, task_info)
            if value is not None:
                # Stored as a fraction; shown as a percentage with 2 decimals.
                value = round(value * 100, 2)
                log_url = metrics.get('log_url')
                if log_url:
                    value = make_clickable_field(value, log_url)

            # Column header is the display name from tasks.json.
            row[task_info['display_name']] = value

        rows.append(row)

    if rows:
        results_df = pd.DataFrame(rows)
    else:
        # Without rows the frame would have no columns, and inserting the
        # 'Agent' column at position 1 below would fail. Build the header
        # explicitly so an empty leaderboard still renders.
        results_df = pd.DataFrame(
            columns=['Model', *(info['display_name'] for info in filtered_tasks.values())]
        )

    # Scores were already rounded when the rows were built; only the
    # placeholder for missing values remains to be applied.
    results_df = results_df.fillna("--")

    if benchmark_type == "agentic":
        # Include agent column as second column after Model
        results_df.insert(1, 'Agent', '[Basic Agent](https://inspect.ai-safety-institute.org.uk/agents.html#sec-basic-agent)')

    return gr.components.Dataframe(
        value=results_df,
        datatype=["html" for _ in results_df.columns],
        column_widths=["250px" if c == "Model" else "150px" for c in results_df.columns],
        wrap=False,
    )
|
| 158 |
|
| 159 |
+
|
| 160 |
black_logo_path = "src/assets/logo-icon-black.png"
|
| 161 |
white_logo_path = "src/assets/logo-icon-white.png"
|
| 162 |
|
|
|
|
| 188 |
|
| 189 |
with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
|
| 190 |
with gr.TabItem("Base Benchmarks", elem_classes="llm-benchmark-tab-table", id=0):
|
| 191 |
+
build_leaderboard("base")
|
| 192 |
|
| 193 |
with gr.TabItem("Agentic Benchmarks", elem_classes="llm-benchmark-tab-table", id=1):
|
| 194 |
+
build_leaderboard("agentic")
|
| 195 |
|
| 196 |
with gr.TabItem("About", elem_classes="llm-benchmark-tab-table", id=2):
|
| 197 |
gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text", sanitize_html=False)
|
|
|
|
| 200 |
gr.Markdown(REPRODUCIBILITY_TEXT, elem_classes="markdown-text", sanitize_html=False)
|
| 201 |
|
| 202 |
assets = [black_logo_path, white_logo_path]
|
| 203 |
+
demo.launch()
|
| 204 |
|
|
|
|
|
|
|
|
|
|
|
|
inspect_log_file_names.json → data/inspect_log_file_names.json
RENAMED
|
File without changes
|
data/populate_results.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
|
| 3 |
+
def get_log_url(model_name: str, log_file_name: "str | None") -> "str | None":
    """Return the log viewer URL for a model's benchmark log file.

    Args:
        model_name: Model identifier; used as a path segment of the URL.
        log_file_name: File name of the log, or ``None`` when there is none.

    Returns:
        ``None`` when ``log_file_name`` is ``None``; otherwise the URL, with
        a trailing ``.json`` extension swapped for ``.eval``.
    """
    if log_file_name is None:
        return None

    # Swap only the trailing extension: a blanket str.replace would also
    # rewrite any ".json" that happens to appear earlier in the name.
    json_suffix = ".json"
    if log_file_name.endswith(json_suffix):
        log_file_name = log_file_name[: -len(json_suffix)] + ".eval"

    return f"https://storage.googleapis.com/inspect-evals/eval/{model_name}/index.html?log_file=logs/logs/{log_file_name}"
|
| 11 |
+
|
| 12 |
+
def main():
    """Attach a ``log_url`` to every task entry and write the result out.

    Reads ``data/results.json`` and ``data/inspect_log_file_names.json``,
    sets ``log_url`` on each task of each model (``None`` when no log file
    name is recorded for that model/task), and writes the updated mapping
    to ``data/results_with_logs.json``.
    """
    with open("data/results.json", "r") as results_file:
        results = json.load(results_file)

    with open("data/inspect_log_file_names.json", "r") as names_file:
        log_files = json.load(names_file)

    for model_name, model_data in results.items():
        # Models without an entry simply get no log links.
        logs_for_model = log_files.get(model_name, {})

        for task_name, task_data in model_data["results"].items():
            log_file_name = logs_for_model.get(task_name)
            # A missing or empty file name yields an explicit null URL.
            task_data["log_url"] = (
                get_log_url(model_name, log_file_name) if log_file_name else None
            )

    # Output goes to a separate file; the input results file is left untouched.
    with open("data/results_with_logs.json", "w") as out_file:
        json.dump(results, out_file, indent=4)


if __name__ == "__main__":
    main()
|
data/results.json
ADDED
|
@@ -0,0 +1,948 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"DeepSeek-R1": {
|
| 3 |
+
"config": {
|
| 4 |
+
"model_name": "DeepSeek-R1",
|
| 5 |
+
"model_sha": "https://api-docs.deepseek.com/news/news250120",
|
| 6 |
+
"model_dtype": "torch.float16"
|
| 7 |
+
},
|
| 8 |
+
"results": {
|
| 9 |
+
"mmlu_pro": {
|
| 10 |
+
"accuracy": 0.8382646276595744,
|
| 11 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-12T11-02-35-05-00_mmlu-pro_BhD89DYN9KM3k4weSDfaQK.eval"
|
| 12 |
+
},
|
| 13 |
+
"humaneval": {
|
| 14 |
+
"mean": 0.9567901234567902,
|
| 15 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-03T11-45-22-05-00_humaneval_hnkHWYqrb5HxiBt2CWzCnq.eval"
|
| 16 |
+
},
|
| 17 |
+
"math": {
|
| 18 |
+
"accuracy": 0.9272,
|
| 19 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-11T11-38-10-05-00_math_ZYFSqsWsmP5kLRLHEMWULU.eval"
|
| 20 |
+
},
|
| 21 |
+
"gsm8k": {
|
| 22 |
+
"accuracy": 0.954510993176649,
|
| 23 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-02T16-28-05-05-00_gsm8k_YMw6WiZkgTBQ54z5UHtDDX.eval"
|
| 24 |
+
},
|
| 25 |
+
"arc_challenge": {
|
| 26 |
+
"accuracy": 0.9667235494880546,
|
| 27 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-01-30T15-42-39-05-00_arc-challenge_CviW9ro6rKBbctkwJzQstp.eval"
|
| 28 |
+
},
|
| 29 |
+
"winogrande": {
|
| 30 |
+
"accuracy": 0.9179163378058406,
|
| 31 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-04T00-25-12-05-00_winogrande_NPgTbtqom2QSPKxeThWrdZ.eval"
|
| 32 |
+
},
|
| 33 |
+
"arc_easy": {
|
| 34 |
+
"accuracy": 0.9873737373737373,
|
| 35 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2024-10-29T17-10-40-04-00_arc-easy_UvprihBMLXPF8JENVLRkdx.eval"
|
| 36 |
+
},
|
| 37 |
+
"gpqa_diamond": {
|
| 38 |
+
"accuracy": 0.7045454545454546,
|
| 39 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-11T11-37-45-05-00_gpqa-diamond_MwnVeLwyuiEAALr3M5q3dn.eval"
|
| 40 |
+
},
|
| 41 |
+
"drop": {
|
| 42 |
+
"mean": null,
|
| 43 |
+
"log_url": null
|
| 44 |
+
},
|
| 45 |
+
"hellaswag": {
|
| 46 |
+
"accuracy": null,
|
| 47 |
+
"log_url": null
|
| 48 |
+
},
|
| 49 |
+
"ifeval": {
|
| 50 |
+
"final_acc": null,
|
| 51 |
+
"log_url": null
|
| 52 |
+
},
|
| 53 |
+
"mmlu": {
|
| 54 |
+
"accuracy": null,
|
| 55 |
+
"log_url": null
|
| 56 |
+
},
|
| 57 |
+
"mmmu_multiple_choice": {
|
| 58 |
+
"accuracy": null,
|
| 59 |
+
"log_url": null
|
| 60 |
+
},
|
| 61 |
+
"mmmu_open": {
|
| 62 |
+
"accuracy": null,
|
| 63 |
+
"log_url": null
|
| 64 |
+
},
|
| 65 |
+
"gaia": {
|
| 66 |
+
"accuracy": null,
|
| 67 |
+
"log_url": null
|
| 68 |
+
},
|
| 69 |
+
"gdm_intercode_ctf": {
|
| 70 |
+
"accuracy": null,
|
| 71 |
+
"log_url": null
|
| 72 |
+
},
|
| 73 |
+
"gdm_in_house_ctf": {
|
| 74 |
+
"accuracy": null,
|
| 75 |
+
"log_url": null
|
| 76 |
+
},
|
| 77 |
+
"agentharm": {
|
| 78 |
+
"avg_score": null,
|
| 79 |
+
"log_url": null
|
| 80 |
+
},
|
| 81 |
+
"agentharm_benign": {
|
| 82 |
+
"avg_score": null,
|
| 83 |
+
"log_url": null
|
| 84 |
+
},
|
| 85 |
+
"swe_bench": {
|
| 86 |
+
"mean": null,
|
| 87 |
+
"log_url": null
|
| 88 |
+
}
|
| 89 |
+
}
|
| 90 |
+
},
|
| 91 |
+
"Meta-Llama-3.1-70B-Instruct": {
|
| 92 |
+
"config": {
|
| 93 |
+
"model_name": "Meta-Llama-3.1-70B-Instruct",
|
| 94 |
+
"model_sha": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
|
| 95 |
+
"model_dtype": "torch.float16"
|
| 96 |
+
},
|
| 97 |
+
"results": {
|
| 98 |
+
"hellaswag": {
|
| 99 |
+
"accuracy": 0.869946225851424,
|
| 100 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T00-45-54-04-00_hellaswag_BKfQG9yGAr383MGnooMLBH.eval"
|
| 101 |
+
},
|
| 102 |
+
"drop": {
|
| 103 |
+
"mean": 0.8811263765076035,
|
| 104 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T21-01-02-04-00_drop_LzAWvLWkNrNKu5qf56wXRo.eval"
|
| 105 |
+
},
|
| 106 |
+
"gpqa_diamond": {
|
| 107 |
+
"accuracy": 0.4318181818181818,
|
| 108 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T23-41-39-04-00_gpqa-diamond_TdLdYmVM6GCVMAECcXkuhj.eval"
|
| 109 |
+
},
|
| 110 |
+
"winogrande": {
|
| 111 |
+
"accuracy": 0.8666140489344909,
|
| 112 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T09-20-56-04-00_winogrande_WnUgkSRhSMvh3zUjnuJWQZ.eval"
|
| 113 |
+
},
|
| 114 |
+
"gsm8k": {
|
| 115 |
+
"accuracy": 0.9469294920394238,
|
| 116 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T00-03-31-04-00_gsm8k_bKsUfCAfcmBCeryboNaLoX.eval"
|
| 117 |
+
},
|
| 118 |
+
"math": {
|
| 119 |
+
"accuracy": 0.6004,
|
| 120 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-34-50-04-00_math_2xiNcrGih26uzJdG4q88bM.eval"
|
| 121 |
+
},
|
| 122 |
+
"ifeval": {
|
| 123 |
+
"final_acc": 0.8604907201780166,
|
| 124 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-29-32-04-00_ifeval_Dwh3CF2ZYFrvw7UcTwrsvK.eval"
|
| 125 |
+
},
|
| 126 |
+
"arc_challenge": {
|
| 127 |
+
"accuracy": 0.9445392491467577,
|
| 128 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-58-56-04-00_arc-challenge_oFL5wFjT7KwNFhMFfe72JN.eval"
|
| 129 |
+
},
|
| 130 |
+
"arc_easy": {
|
| 131 |
+
"accuracy": 0.9823232323232324,
|
| 132 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-53-12-04-00_arc-easy_UXzR7cDeNteP39NoXUYnhm.eval"
|
| 133 |
+
},
|
| 134 |
+
"mmlu_pro": {
|
| 135 |
+
"accuracy": 0.6688829787234043,
|
| 136 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T06-11-16-04-00_mmlu-pro_oQiEBJdeKtEEt4cm9KL7uy.eval"
|
| 137 |
+
},
|
| 138 |
+
"humaneval": {
|
| 139 |
+
"mean": 0.7865853658536586,
|
| 140 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-28-25-04-00_humaneval_KcJV2rHuHJ2JLxijihEkcW.eval"
|
| 141 |
+
},
|
| 142 |
+
"mmlu": {
|
| 143 |
+
"accuracy": 0.8033755875231449,
|
| 144 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T03-51-50-04-00_mmlu_6SNjs2QmPRvqGnvbnNtaqb.eval"
|
| 145 |
+
},
|
| 146 |
+
"mmmu_multiple_choice": {
|
| 147 |
+
"accuracy": null,
|
| 148 |
+
"log_url": null
|
| 149 |
+
},
|
| 150 |
+
"mmmu_open": {
|
| 151 |
+
"accuracy": null,
|
| 152 |
+
"log_url": null
|
| 153 |
+
},
|
| 154 |
+
"gaia": {
|
| 155 |
+
"accuracy": null,
|
| 156 |
+
"log_url": null
|
| 157 |
+
},
|
| 158 |
+
"gdm_intercode_ctf": {
|
| 159 |
+
"accuracy": null,
|
| 160 |
+
"log_url": null
|
| 161 |
+
},
|
| 162 |
+
"gdm_in_house_ctf": {
|
| 163 |
+
"accuracy": null,
|
| 164 |
+
"log_url": null
|
| 165 |
+
},
|
| 166 |
+
"agentharm": {
|
| 167 |
+
"avg_score": null,
|
| 168 |
+
"log_url": null
|
| 169 |
+
},
|
| 170 |
+
"agentharm_benign": {
|
| 171 |
+
"avg_score": null,
|
| 172 |
+
"log_url": null
|
| 173 |
+
},
|
| 174 |
+
"swe_bench": {
|
| 175 |
+
"mean": null,
|
| 176 |
+
"log_url": null
|
| 177 |
+
}
|
| 178 |
+
}
|
| 179 |
+
},
|
| 180 |
+
"Mistral-Large-Instruct-2407": {
|
| 181 |
+
"config": {
|
| 182 |
+
"model_name": "Mistral-Large-Instruct-2407",
|
| 183 |
+
"model_sha": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
|
| 184 |
+
"model_dtype": "torch.float16"
|
| 185 |
+
},
|
| 186 |
+
"results": {
|
| 187 |
+
"drop": {
|
| 188 |
+
"mean": 0.7424257996853698,
|
| 189 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T01-56-12-04-00_drop_NtvuCoU2LoMbH8DztcCTen.eval"
|
| 190 |
+
},
|
| 191 |
+
"ifeval": {
|
| 192 |
+
"final_acc": 0.8285172231900246,
|
| 193 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T06-30-16-04-00_ifeval_TLkvCSFEWo4PLv6hAha7YB.eval"
|
| 194 |
+
},
|
| 195 |
+
"mmlu": {
|
| 196 |
+
"accuracy": 0.8035892323030908,
|
| 197 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T07-21-48-04-00_mmlu_YnUhmHoStr3WuJdchWmNPt.eval"
|
| 198 |
+
},
|
| 199 |
+
"gpqa_diamond": {
|
| 200 |
+
"accuracy": 0.4734848484848485,
|
| 201 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T04-22-52-04-00_gpqa-diamond_SuZUZxGdqS2ZecbLRNkKd4.eval"
|
| 202 |
+
},
|
| 203 |
+
"gsm8k": {
|
| 204 |
+
"accuracy": 0.9378316906747536,
|
| 205 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T04-28-49-04-00_gsm8k_5tQp9tbwUMj6NpjNKCAfVm.eval"
|
| 206 |
+
},
|
| 207 |
+
"math": {
|
| 208 |
+
"accuracy": 0.6574,
|
| 209 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T06-33-09-04-00_math_2CmjBedAfUxqvmcHRdBgyB.eval"
|
| 210 |
+
},
|
| 211 |
+
"arc_easy": {
|
| 212 |
+
"accuracy": 0.9852693602693603,
|
| 213 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T01-48-39-04-00_arc-easy_YbfuBT3usZXt2xgZkkR5dq.eval"
|
| 214 |
+
},
|
| 215 |
+
"mmlu_pro": {
|
| 216 |
+
"accuracy": 0.6942320478723404,
|
| 217 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T09-41-25-04-00_mmlu-pro_fyYT4aabPesfY5TpzFMPnd.eval"
|
| 218 |
+
},
|
| 219 |
+
"humaneval": {
|
| 220 |
+
"mean": 0.8658536585365854,
|
| 221 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T06-29-24-04-00_humaneval_nu8SUSGekKJWB8HLKDigYK.eval"
|
| 222 |
+
},
|
| 223 |
+
"hellaswag": {
|
| 224 |
+
"accuracy": 0.9047998406691894,
|
| 225 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T04-50-00-04-00_hellaswag_ZzQoZ6gkRQsTzMhQr7GYNn.eval"
|
| 226 |
+
},
|
| 227 |
+
"arc_challenge": {
|
| 228 |
+
"accuracy": 0.9436860068259386,
|
| 229 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T01-54-13-04-00_arc-challenge_WfQRhMkFcywefpU46isBVP.eval"
|
| 230 |
+
},
|
| 231 |
+
"winogrande": {
|
| 232 |
+
"accuracy": 0.8547750591949487,
|
| 233 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T11-57-58-04-00_winogrande_TP3UGwpp37Dyv6ks9Ty5Hk.eval"
|
| 234 |
+
},
|
| 235 |
+
"mmmu_multiple_choice": {
|
| 236 |
+
"accuracy": null,
|
| 237 |
+
"log_url": null
|
| 238 |
+
},
|
| 239 |
+
"mmmu_open": {
|
| 240 |
+
"accuracy": null,
|
| 241 |
+
"log_url": null
|
| 242 |
+
},
|
| 243 |
+
"gaia": {
|
| 244 |
+
"accuracy": null,
|
| 245 |
+
"log_url": null
|
| 246 |
+
},
|
| 247 |
+
"gdm_intercode_ctf": {
|
| 248 |
+
"accuracy": null,
|
| 249 |
+
"log_url": null
|
| 250 |
+
},
|
| 251 |
+
"gdm_in_house_ctf": {
|
| 252 |
+
"accuracy": null,
|
| 253 |
+
"log_url": null
|
| 254 |
+
},
|
| 255 |
+
"agentharm": {
|
| 256 |
+
"avg_score": null,
|
| 257 |
+
"log_url": null
|
| 258 |
+
},
|
| 259 |
+
"agentharm_benign": {
|
| 260 |
+
"avg_score": null,
|
| 261 |
+
"log_url": null
|
| 262 |
+
},
|
| 263 |
+
"swe_bench": {
|
| 264 |
+
"mean": null,
|
| 265 |
+
"log_url": null
|
| 266 |
+
}
|
| 267 |
+
}
|
| 268 |
+
},
|
| 269 |
+
"c4ai-command-r-plus": {
|
| 270 |
+
"config": {
|
| 271 |
+
"model_name": "c4ai-command-r-plus",
|
| 272 |
+
"model_sha": "https://huggingface.co/CohereForAI/c4ai-command-r-plus"
|
| 273 |
+
},
|
| 274 |
+
"results": {
|
| 275 |
+
"ifeval": {
|
| 276 |
+
"final_acc": 0.7779591483929307,
|
| 277 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T17-23-04-04-00_ifeval_RGucUMwdGmUnRpqyMTZTzW.eval"
|
| 278 |
+
},
|
| 279 |
+
"winogrande": {
|
| 280 |
+
"accuracy": 0.7490134175217048,
|
| 281 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T14-42-18-04-00_winogrande_bY8yg7aRR5dCCK7NDCZEcc.eval"
|
| 282 |
+
},
|
| 283 |
+
"arc_challenge": {
|
| 284 |
+
"accuracy": 0.8506825938566553,
|
| 285 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-29T17-30-03-04-00_arc-challenge_XB7LURXEGaxskWuLtYwdnW.eval"
|
| 286 |
+
},
|
| 287 |
+
"drop": {
|
| 288 |
+
"mean": 0.743557420031463,
|
| 289 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T12-06-30-04-00_drop_itY9cLiYAW2BF7NTeDceNd.eval"
|
| 290 |
+
},
|
| 291 |
+
"math": {
|
| 292 |
+
"accuracy": 0.2626,
|
| 293 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T17-26-34-04-00_math_kohBUMpMFuMsR4jz4vUNWM.eval"
|
| 294 |
+
},
|
| 295 |
+
"gpqa_diamond": {
|
| 296 |
+
"accuracy": 0.3194444444444444,
|
| 297 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-29T22-47-45-04-00_gpqa-diamond_JKpb6ya4pec9hh7uovPPCZ.eval"
|
| 298 |
+
},
|
| 299 |
+
"mmlu_pro": {
|
| 300 |
+
"accuracy": 0.441156914893617,
|
| 301 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-31T01-11-38-04-00_mmlu-pro_gZVAuy3zMKR23BieM5PqAX.eval"
|
| 302 |
+
},
|
| 303 |
+
"humaneval": {
|
| 304 |
+
"mean": 0.6219512195121951,
|
| 305 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T17-22-23-04-00_humaneval_5ByPqUhoofSbKgvsUQNFCX.eval"
|
| 306 |
+
},
|
| 307 |
+
"gsm8k": {
|
| 308 |
+
"accuracy": 0.7816527672479151,
|
| 309 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T15-03-35-04-00_gsm8k_QxbfbriJsKGQAg96JyjkoT.eval"
|
| 310 |
+
},
|
| 311 |
+
"hellaswag": {
|
| 312 |
+
"accuracy": 0.7954590718980283,
|
| 313 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T15-18-17-04-00_hellaswag_UYyBTR6N8VJnKRmnbCrB8N.eval"
|
| 314 |
+
},
|
| 315 |
+
"mmlu": {
|
| 316 |
+
"accuracy": 0.695128899017234,
|
| 317 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T21-55-26-04-00_mmlu_JUPPLTzfe3Kme6UuorPTqg.eval"
|
| 318 |
+
},
|
| 319 |
+
"arc_easy": {
|
| 320 |
+
"accuracy": 0.9377104377104377,
|
| 321 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-29T17-10-40-04-00_arc-easy_UvprihBMLXPF8JENVLRkdx.eval"
|
| 322 |
+
}
|
| 323 |
+
}
|
| 324 |
+
},
|
| 325 |
+
"claude-3-5-sonnet-20241022": {
|
| 326 |
+
"config": {
|
| 327 |
+
"model_name": "claude-3-5-sonnet-20241022",
|
| 328 |
+
"model_sha": "https://www.anthropic.com/claude/sonnet",
|
| 329 |
+
"model_dtype": "torch.float16"
|
| 330 |
+
},
|
| 331 |
+
"results": {
|
| 332 |
+
"mmmu_multiple_choice": {
|
| 333 |
+
"accuracy": 0.6481700118063755,
|
| 334 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-21T11-20-03-05-00_mmmu-multiple-choice_CWhKvGdoFo6pdHhDyi9GNm.eval"
|
| 335 |
+
},
|
| 336 |
+
"mmlu_pro": {
|
| 337 |
+
"accuracy": 0.7762632978723404,
|
| 338 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T19-01-05-05-00_mmlu-pro_3vi84or97gQupuj5sT6vgZ.eval"
|
| 339 |
+
},
|
| 340 |
+
"hellaswag": {
|
| 341 |
+
"accuracy": 0.9228241386178052,
|
| 342 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T15-09-33-05-00_hellaswag_QXqFxojvSToMu8ckHEMLkB.eval"
|
| 343 |
+
},
|
| 344 |
+
"gpqa_diamond": {
|
| 345 |
+
"accuracy": 0.6098484848484849,
|
| 346 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T13-56-36-05-00_gpqa-diamond_eg4gFaMRENjnnYvQNtSB59.eval"
|
| 347 |
+
},
|
| 348 |
+
"gsm8k": {
|
| 349 |
+
"accuracy": 0.9620924943138741,
|
| 350 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T14-23-25-05-00_gsm8k_nHB8Z4uZAwRAZFYpKmTptA.eval"
|
| 351 |
+
},
|
| 352 |
+
"mmmu_open": {
|
| 353 |
+
"accuracy": 0.41509433962264153,
|
| 354 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-21T11-24-21-05-00_mmmu-open_SSjv3Dq9gZkEEUnvJUd5xf.eval"
|
| 355 |
+
},
|
| 356 |
+
"arc_easy": {
|
| 357 |
+
"accuracy": 0.9915824915824916,
|
| 358 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T10-06-24-05-00_arc-easy_oBReQZQM5SAwMMD2jFshPb.eval"
|
| 359 |
+
},
|
| 360 |
+
"arc_challenge": {
|
| 361 |
+
"accuracy": 0.9692832764505119,
|
| 362 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T10-12-11-05-00_arc-challenge_X8i6caCzkcQo5AT5zXkXso.eval"
|
| 363 |
+
},
|
| 364 |
+
"mmlu": {
|
| 365 |
+
"accuracy": 0.8665432274604757,
|
| 366 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T15-16-51-05-00_mmlu_NFDs2kxmh3kQEbpbd8sz3w.eval"
|
| 367 |
+
},
|
| 368 |
+
"math": {
|
| 369 |
+
"accuracy": 0.7942,
|
| 370 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T12-29-54-05-00_math_NvNQU58M8r3fpiwPGnvq8h.eval"
|
| 371 |
+
},
|
| 372 |
+
"ifeval": {
|
| 373 |
+
"final_acc": 0.8958114469607309,
|
| 374 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T11-28-44-05-00_ifeval_fmWxch4ZjbmYCST6yUZsdV.eval"
|
| 375 |
+
},
|
| 376 |
+
"humaneval": {
|
| 377 |
+
"mean": 0.9451219512195121,
|
| 378 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T11-26-12-05-00_humaneval_kUASiaNd9uZfWvCwYHhdF5.eval"
|
| 379 |
+
},
|
| 380 |
+
"winogrande": {
|
| 381 |
+
"accuracy": 0.9021310181531176,
|
| 382 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T22-09-41-05-00_winogrande_mSWGAKg75E5RP79KWizvb9.eval"
|
| 383 |
+
},
|
| 384 |
+
"drop": {
|
| 385 |
+
"mean": 0.8977608809648663,
|
| 386 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T10-15-15-05-00_drop_Z9A2Y84HYponNxnzNT9TNq.eval"
|
| 387 |
+
},
|
| 388 |
+
"gaia": {
|
| 389 |
+
"accuracy": 0.3381818181818182,
|
| 390 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-12T23-57-37-05-00_claude-3-5-sonnet_gaia_merged.eval"
|
| 391 |
+
},
|
| 392 |
+
"gdm_intercode_ctf": {
|
| 393 |
+
"accuracy": 0.8556962025316455,
|
| 394 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-11T02-47-45-05-00_claude-3-5-sonnet_gdm-intercode-ctf_merged.eval"
|
| 395 |
+
},
|
| 396 |
+
"gdm_in_house_ctf": {
|
| 397 |
+
"accuracy": 0.6153846153846154,
|
| 398 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-11T07-41-14+00-00_claude-3-5-sonnet_gdm-in-house-ctf.eval"
|
| 399 |
+
},
|
| 400 |
+
"agentharm": {
|
| 401 |
+
"avg_score": 0.14767992424242424,
|
| 402 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T08-05-14-08-00_agentharm_VJGhWKLrVLdQczBZVgCXHc.eval"
|
| 403 |
+
},
|
| 404 |
+
"agentharm_benign": {
|
| 405 |
+
"avg_score": 0.800704570051161,
|
| 406 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-21T15-09-48-08-00_agentharm-benign_A3uBBWNvv88P5BsgqwFCfg.eval"
|
| 407 |
+
},
|
| 408 |
+
"swe_bench": {
|
| 409 |
+
"mean": 0.0672,
|
| 410 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T18-56-55+00-00_anthropic-claude-3-5-sonnet.eval"
|
| 411 |
+
}
|
| 412 |
+
}
|
| 413 |
+
},
|
| 414 |
+
"gemini-1.5-flash": {
|
| 415 |
+
"config": {
|
| 416 |
+
"model_name": "gemini-1.5-flash",
|
| 417 |
+
"model_sha": "https://deepmind.google/technologies/gemini/flash",
|
| 418 |
+
"model_dtype": "torch.float16"
|
| 419 |
+
},
|
| 420 |
+
"results": {
|
| 421 |
+
"gpqa_diamond": {
|
| 422 |
+
"accuracy": 0.40404040404040403,
|
| 423 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T19-31-26-04-00_gpqa-diamond_7aNe9wQiQKpNN96mfaWBPg.eval"
|
| 424 |
+
},
|
| 425 |
+
"arc_challenge": {
|
| 426 |
+
"accuracy": 0.9308873720136519,
|
| 427 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T17-34-51-04-00_arc-challenge_FbGgLswBZbRE4EhWiMyRt6.eval"
|
| 428 |
+
},
|
| 429 |
+
"math": {
|
| 430 |
+
"accuracy": 0.452,
|
| 431 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T21-15-49-04-00_math_YsWdRzpqMq2dqQ9SPKfack.eval"
|
| 432 |
+
},
|
| 433 |
+
"mmmu_open": {
|
| 434 |
+
"accuracy": 0.16981132075471697,
|
| 435 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2025-01-20T23-13-27-05-00_mmmu-open_GWi6XNYUSLq99BdabtScGm.eval"
|
| 436 |
+
},
|
| 437 |
+
"drop": {
|
| 438 |
+
"mean": 0.751044572627163,
|
| 439 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T17-36-25-04-00_drop_6TzJGqqEkpFUCxGD4QejV6.eval"
|
| 440 |
+
},
|
| 441 |
+
"mmlu_pro": {
|
| 442 |
+
"accuracy": 0.5993184840425532,
|
| 443 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T22-43-30-04-00_mmlu-pro_Dc2uu3EV7MJtjg6gg5Y9qH.eval"
|
| 444 |
+
},
|
| 445 |
+
"ifeval": {
|
| 446 |
+
"final_acc": 0.7681296737102001,
|
| 447 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T21-15-06-04-00_ifeval_nYs9KujQMQjcpbpbLtVx8G.eval"
|
| 448 |
+
},
|
| 449 |
+
"hellaswag": {
|
| 450 |
+
"accuracy": 0.8557060346544513,
|
| 451 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T19-35-34-04-00_hellaswag_2SAz3cvMpDxFaApdHDR3s4.eval"
|
| 452 |
+
},
|
| 453 |
+
"winogrande": {
|
| 454 |
+
"accuracy": 0.7884767166535123,
|
| 455 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-31T00-59-07-04-00_winogrande_Ci55vHvbGGW38zVpMCwtWa.eval"
|
| 456 |
+
},
|
| 457 |
+
"humaneval": {
|
| 458 |
+
"mean": 0.7439024390243902,
|
| 459 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T21-14-41-04-00_humaneval_Z9aXdUERuwYxoTheZ5GANC.eval"
|
| 460 |
+
},
|
| 461 |
+
"arc_easy": {
|
| 462 |
+
"accuracy": 0.984006734006734,
|
| 463 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T17-29-56-04-00_arc-easy_XcEzqqPqJsRV29NqYDfnNo.eval"
|
| 464 |
+
},
|
| 465 |
+
"gsm8k": {
|
| 466 |
+
"accuracy": 0.8582259287338894,
|
| 467 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T19-32-39-04-00_gsm8k_nLSssETKDDWNktAFWnVwfv.eval"
|
| 468 |
+
},
|
| 469 |
+
"mmlu": {
|
| 470 |
+
"accuracy": 0.7714713003845606,
|
| 471 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-31T10-49-43-04-00_mmlu_oGb9mspeGbYS2gfbkknskN.eval"
|
| 472 |
+
},
|
| 473 |
+
"mmmu_multiple_choice": {
|
| 474 |
+
"accuracy": 0.5702479338842975,
|
| 475 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2025-01-20T23-10-01-05-00_mmmu-multiple-choice_c5rLkrXkV83udX6DVJui5F.eval"
|
| 476 |
+
},
|
| 477 |
+
"gaia": {
|
| 478 |
+
"accuracy": null,
|
| 479 |
+
"log_url": null
|
| 480 |
+
},
|
| 481 |
+
"gdm_intercode_ctf": {
|
| 482 |
+
"accuracy": null,
|
| 483 |
+
"log_url": null
|
| 484 |
+
},
|
| 485 |
+
"gdm_in_house_ctf": {
|
| 486 |
+
"accuracy": null,
|
| 487 |
+
"log_url": null
|
| 488 |
+
},
|
| 489 |
+
"agentharm": {
|
| 490 |
+
"avg_score": null,
|
| 491 |
+
"log_url": null
|
| 492 |
+
},
|
| 493 |
+
"agentharm_benign": {
|
| 494 |
+
"avg_score": null,
|
| 495 |
+
"log_url": null
|
| 496 |
+
},
|
| 497 |
+
"swe_bench": {
|
| 498 |
+
"mean": null,
|
| 499 |
+
"log_url": null
|
| 500 |
+
}
|
| 501 |
+
}
|
| 502 |
+
},
|
| 503 |
+
"gemini-1.5-pro": {
|
| 504 |
+
"config": {
|
| 505 |
+
"model_name": "gemini-1.5-pro",
|
| 506 |
+
"model_sha": "https://deepmind.google/technologies/gemini/pro",
|
| 507 |
+
"model_dtype": "torch.float16"
|
| 508 |
+
},
|
| 509 |
+
"results": {
|
| 510 |
+
"mmlu": {
|
| 511 |
+
"accuracy": 0.8467454778521578,
|
| 512 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T16-56-26-05-00_mmlu_Z9KrcK7x4ZLAR5nJ9JaVUe.eval"
|
| 513 |
+
},
|
| 514 |
+
"humaneval": {
|
| 515 |
+
"mean": 0.8719512195121951,
|
| 516 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-43-07-05-00_humaneval_5JBjtymGtK23qwVKxqidhV.eval"
|
| 517 |
+
},
|
| 518 |
+
"mmmu_multiple_choice": {
|
| 519 |
+
"accuracy": 0.6304604486422668,
|
| 520 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2025-01-20T23-16-04-05-00_mmmu-multiple-choice_NLmxmHYt6CJymRVVa5UsbD.eval"
|
| 521 |
+
},
|
| 522 |
+
"mmlu_pro": {
|
| 523 |
+
"accuracy": 0.7563996010638298,
|
| 524 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T20-13-09-05-00_mmlu-pro_Hv2ujvKLV6H7ZwQu2q8LNw.eval"
|
| 525 |
+
},
|
| 526 |
+
"math": {
|
| 527 |
+
"accuracy": 0.852,
|
| 528 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T15-48-46-05-00_math_9DAZmGEfhpa3nUcmMAwqZe.eval"
|
| 529 |
+
},
|
| 530 |
+
"arc_easy": {
|
| 531 |
+
"accuracy": 0.9877946127946128,
|
| 532 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-31-43-05-00_arc-easy_eGxYWywpLuREcaCKvHa8Uk.eval"
|
| 533 |
+
},
|
| 534 |
+
"mmmu_open": {
|
| 535 |
+
"accuracy": 0.3584905660377358,
|
| 536 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2025-01-20T23-19-25-05-00_mmmu-open_CDbtEQ7tjs5zkj4ScBbzod.eval"
|
| 537 |
+
},
|
| 538 |
+
"gsm8k": {
|
| 539 |
+
"accuracy": 0.9613343442001516,
|
| 540 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T15-15-26-05-00_gsm8k_cTebw3ugfrVz3dyPwxtdUZ.eval"
|
| 541 |
+
},
|
| 542 |
+
"gpqa_diamond": {
|
| 543 |
+
"accuracy": 0.5782828282828283,
|
| 544 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-05T09-56-31-05-00_gpqa-diamond_FBq2bnoyGYQ3NF96xQw8iy.eval"
|
| 545 |
+
},
|
| 546 |
+
"ifeval": {
|
| 547 |
+
"final_acc": 0.8982344623377084,
|
| 548 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-43-32-05-00_ifeval_mSwZ7AwA7akj5PjZbQMjgC.eval"
|
| 549 |
+
},
|
| 550 |
+
"winogrande": {
|
| 551 |
+
"accuracy": 0.8768745067087609,
|
| 552 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-40-46-05-00_winogrande_5SmD6rx47zmZvHHkQSSfHK.eval"
|
| 553 |
+
},
|
| 554 |
+
"arc_challenge": {
|
| 555 |
+
"accuracy": 0.9633105802047781,
|
| 556 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-37-36-05-00_arc-challenge_5VVApyQD22QpJoMm53EMdU.eval"
|
| 557 |
+
},
|
| 558 |
+
"drop": {
|
| 559 |
+
"mean": 0.8800912427897221,
|
| 560 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-44-32-05-00_drop_9dzPKVJojSVsxmiBFnej2m.eval"
|
| 561 |
+
},
|
| 562 |
+
"hellaswag": {
|
| 563 |
+
"accuracy": 0.9123680541724756,
|
| 564 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-05T13-14-31-05-00_hellaswag_N98eeftuY2pucRtgpUYk5m.eval"
|
| 565 |
+
},
|
| 566 |
+
"gaia": {
|
| 567 |
+
"accuracy": 0.13818181818181818,
|
| 568 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T15-33-29-05-00_gemini-1.5-pro_gaia_merged.eval"
|
| 569 |
+
},
|
| 570 |
+
"gdm_intercode_ctf": {
|
| 571 |
+
"accuracy": 0.5291139240506328,
|
| 572 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T23-59-58+00-00_gemini-1.5-pro_gdm-intercode-ctf_merged.eval"
|
| 573 |
+
},
|
| 574 |
+
"gdm_in_house_ctf": {
|
| 575 |
+
"accuracy": 0.23076923076923078,
|
| 576 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-22T03-42-16+00-00_gemini-1.5-pro_gdm-in-house-ctf.eval"
|
| 577 |
+
},
|
| 578 |
+
"agentharm": {
|
| 579 |
+
"avg_score": 0.2898649645808737,
|
| 580 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T12-45-43-08-00_agentharm_VmD26soLwmRgWPo3hpRHBr.eval"
|
| 581 |
+
},
|
| 582 |
+
"agentharm_benign": {
|
| 583 |
+
"avg_score": 0.5961489079102715,
|
| 584 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T13-18-51-08-00_agentharm-benign_gP3pQPxAuCtFLiHzt2Egt7.eval"
|
| 585 |
+
},
|
| 586 |
+
"swe_bench": {
|
| 587 |
+
"mean": 0.004,
|
| 588 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-22T03-00-08+00-00_google-gemini-1.5-pro_swe.eval"
|
| 589 |
+
}
|
| 590 |
+
}
|
| 591 |
+
},
|
| 592 |
+
"gpt-4o": {
|
| 593 |
+
"config": {
|
| 594 |
+
"model_name": "gpt-4o",
|
| 595 |
+
"model_sha": "https://openai.com/index/hello-gpt-4o",
|
| 596 |
+
"model_dtype": "torch.float16"
|
| 597 |
+
},
|
| 598 |
+
"results": {
|
| 599 |
+
"gpqa_diamond": {
|
| 600 |
+
"accuracy": 0.51010101010101,
|
| 601 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T03-29-33-04-00_gpqa-diamond_nFmRv5MJiYjHjezmq4V6Va.eval"
|
| 602 |
+
},
|
| 603 |
+
"arc_challenge": {
|
| 604 |
+
"accuracy": 0.9633105802047781,
|
| 605 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T01-45-55-04-00_arc-challenge_nrsPPxh4DpzgLPQDFdcfVp.eval"
|
| 606 |
+
},
|
| 607 |
+
"gsm8k": {
|
| 608 |
+
"accuracy": 0.9446550416982562,
|
| 609 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T03-31-16-04-00_gsm8k_jVXeSvHowbietZCFsFYCwB.eval"
|
| 610 |
+
},
|
| 611 |
+
"mmlu": {
|
| 612 |
+
"accuracy": 0.8435408061529697,
|
| 613 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T10-49-43-04-00_mmlu_GarLpfQFSpM3C22nbbGp54.eval"
|
| 614 |
+
},
|
| 615 |
+
"ifeval": {
|
| 616 |
+
"final_acc": 0.8780386042367585,
|
| 617 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T05-00-11-04-00_ifeval_jxreUu8JqRdkrcHP4E3hLR.eval"
|
| 618 |
+
},
|
| 619 |
+
"mmlu_pro": {
|
| 620 |
+
"accuracy": 0.7450964095744681,
|
| 621 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T06-59-42-04-00_mmlu-pro_EuAKDwAWSfNVpqyyqrf2Ba.eval"
|
| 622 |
+
},
|
| 623 |
+
"mmmu_open": {
|
| 624 |
+
"accuracy": 0.3584905660377358,
|
| 625 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2025-01-20T23-07-46-05-00_mmmu-open_d3Q2HvuPZzEX6FAM4NBhnp.eval"
|
| 626 |
+
},
|
| 627 |
+
"winogrande": {
|
| 628 |
+
"accuracy": 0.9013417521704814,
|
| 629 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T09-02-03-04-00_winogrande_44kKF7M9mKoqVC7ixZVXuq.eval"
|
| 630 |
+
},
|
| 631 |
+
"drop": {
|
| 632 |
+
"mean": 0.7511693759832198,
|
| 633 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T01-47-20-04-00_drop_3gxDcn6vUoR3nvHX9BcSq4.eval"
|
| 634 |
+
},
|
| 635 |
+
"arc_easy": {
|
| 636 |
+
"accuracy": 0.9915824915824916,
|
| 637 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T01-41-34-04-00_arc-easy_nUavRHdiRVfrxo6dmCPadh.eval"
|
| 638 |
+
},
|
| 639 |
+
"mmmu_multiple_choice": {
|
| 640 |
+
"accuracy": 0.5903187721369539,
|
| 641 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2025-01-20T23-03-21-05-00_mmmu-multiple-choice_eoycAFLMirSqiURdXmBP2e.eval"
|
| 642 |
+
},
|
| 643 |
+
"humaneval": {
|
| 644 |
+
"mean": 0.9085365853658537,
|
| 645 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T04-59-42-04-00_humaneval_nmJcd84CcNKjWS8fBfMbZM.eval"
|
| 646 |
+
},
|
| 647 |
+
"math": {
|
| 648 |
+
"accuracy": 0.7054,
|
| 649 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T05-01-22-04-00_math_cDSpKPp3nLrFy8uYfYKEbM.eval"
|
| 650 |
+
},
|
| 651 |
+
"hellaswag": {
|
| 652 |
+
"accuracy": 0.924317864967138,
|
| 653 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T03-33-47-04-00_hellaswag_JNnnPuz3dhZRpyXzizMUBF.eval"
|
| 654 |
+
},
|
| 655 |
+
"gaia": {
|
| 656 |
+
"accuracy": 0.16606060606060608,
|
| 657 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-13T15-53-22+00-00_gpt-4o_gaia_merged.eval"
|
| 658 |
+
},
|
| 659 |
+
"gdm_intercode_ctf": {
|
| 660 |
+
"accuracy": 0.6379746835443038,
|
| 661 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-08T10-06-29-05-00_gpt-4o_gdm-intercode-ctf_merged.eval"
|
| 662 |
+
},
|
| 663 |
+
"gdm_in_house_ctf": {
|
| 664 |
+
"accuracy": 0.23076923076923078,
|
| 665 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-11T07-02-14+00-00_gpt-4o_gdm-in-house-ctf.eval"
|
| 666 |
+
},
|
| 667 |
+
"agentharm": {
|
| 668 |
+
"avg_score": 0.49953844451003543,
|
| 669 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-07T16-34-15-08-00_agentharm_UfSoyHEAH2E5RVdrPVUemy.eval"
|
| 670 |
+
},
|
| 671 |
+
"agentharm_benign": {
|
| 672 |
+
"avg_score": 0.8249433048012594,
|
| 673 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-21T13-45-18-08-00_agentharm-benign_8DhGJqEAvw6o8uCv4a4dVz.eval"
|
| 674 |
+
},
|
| 675 |
+
"swe_bench": {
|
| 676 |
+
"mean": 0.012,
|
| 677 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-14T23-09-10+00-00_openai-gpt-4o_swe.eval"
|
| 678 |
+
}
|
| 679 |
+
}
|
| 680 |
+
},
|
| 681 |
+
"gpt-4o-mini": {
|
| 682 |
+
"config": {
|
| 683 |
+
"model_name": "gpt-4o-mini",
|
| 684 |
+
"model_sha": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
|
| 685 |
+
"model_dtype": "torch.float16"
|
| 686 |
+
},
|
| 687 |
+
"results": {
|
| 688 |
+
"drop": {
|
| 689 |
+
"mean": 0.8065915049816466,
|
| 690 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T17-36-25-04-00_drop_6TzJGqqEkpFUCxGD4QejV6.eval"
|
| 691 |
+
},
|
| 692 |
+
"humaneval": {
|
| 693 |
+
"mean": 0.8597560975609756,
|
| 694 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T21-14-41-04-00_humaneval_Z9aXdUERuwYxoTheZ5GANC.eval"
|
| 695 |
+
},
|
| 696 |
+
"gpqa_diamond": {
|
| 697 |
+
"accuracy": 0.3838383838383838,
|
| 698 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T19-31-26-04-00_gpqa-diamond_7aNe9wQiQKpNN96mfaWBPg.eval"
|
| 699 |
+
},
|
| 700 |
+
"mmmu_open": {
|
| 701 |
+
"accuracy": 0.18867924528301888,
|
| 702 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2025-01-20T23-13-27-05-00_mmmu-open_GWi6XNYUSLq99BdabtScGm.eval"
|
| 703 |
+
},
|
| 704 |
+
"arc_challenge": {
|
| 705 |
+
"accuracy": 0.9249146757679181,
|
| 706 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T17-34-51-04-00_arc-challenge_FbGgLswBZbRE4EhWiMyRt6.eval"
|
| 707 |
+
},
|
| 708 |
+
"mmlu": {
|
| 709 |
+
"accuracy": 0.7698333570716422,
|
| 710 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-31T10-49-43-04-00_mmlu_oGb9mspeGbYS2gfbkknskN.eval"
|
| 711 |
+
},
|
| 712 |
+
"hellaswag": {
|
| 713 |
+
"accuracy": 0.8750248954391555,
|
| 714 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T19-35-34-04-00_hellaswag_2SAz3cvMpDxFaApdHDR3s4.eval"
|
| 715 |
+
},
|
| 716 |
+
"ifeval": {
|
| 717 |
+
"final_acc": 0.8419061423689144,
|
| 718 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T21-15-06-04-00_ifeval_nYs9KujQMQjcpbpbLtVx8G.eval"
|
| 719 |
+
},
|
| 720 |
+
"mmmu_multiple_choice": {
|
| 721 |
+
"accuracy": 0.5395513577331759,
|
| 722 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2025-01-20T23-10-01-05-00_mmmu-multiple-choice_c5rLkrXkV83udX6DVJui5F.eval"
|
| 723 |
+
},
|
| 724 |
+
"arc_easy": {
|
| 725 |
+
"accuracy": 0.9793771043771043,
|
| 726 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T17-29-56-04-00_arc-easy_XcEzqqPqJsRV29NqYDfnNo.eval"
|
| 727 |
+
},
|
| 728 |
+
"winogrande": {
|
| 729 |
+
"accuracy": 0.7529597474348856,
|
| 730 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-31T00-59-07-04-00_winogrande_Ci55vHvbGGW38zVpMCwtWa.eval"
|
| 731 |
+
},
|
| 732 |
+
"mmlu_pro": {
|
| 733 |
+
"accuracy": 0.6396276595744681,
|
| 734 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T22-43-30-04-00_mmlu-pro_Dc2uu3EV7MJtjg6gg5Y9qH.eval"
|
| 735 |
+
},
|
| 736 |
+
"math": {
|
| 737 |
+
"accuracy": 0.633,
|
| 738 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T21-15-49-04-00_math_YsWdRzpqMq2dqQ9SPKfack.eval"
|
| 739 |
+
},
|
| 740 |
+
"gsm8k": {
|
| 741 |
+
"accuracy": 0.9181197877179682,
|
| 742 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T19-32-39-04-00_gsm8k_nLSssETKDDWNktAFWnVwfv.eval"
|
| 743 |
+
},
|
| 744 |
+
"gaia": {
|
| 745 |
+
"accuracy": null,
|
| 746 |
+
"log_url": null
|
| 747 |
+
},
|
| 748 |
+
"gdm_intercode_ctf": {
|
| 749 |
+
"accuracy": null,
|
| 750 |
+
"log_url": null
|
| 751 |
+
},
|
| 752 |
+
"gdm_in_house_ctf": {
|
| 753 |
+
"accuracy": null,
|
| 754 |
+
"log_url": null
|
| 755 |
+
},
|
| 756 |
+
"agentharm": {
|
| 757 |
+
"avg_score": null,
|
| 758 |
+
"log_url": null
|
| 759 |
+
},
|
| 760 |
+
"agentharm_benign": {
|
| 761 |
+
"avg_score": null,
|
| 762 |
+
"log_url": null
|
| 763 |
+
},
|
| 764 |
+
"swe_bench": {
|
| 765 |
+
"mean": null,
|
| 766 |
+
"log_url": null
|
| 767 |
+
}
|
| 768 |
+
}
|
| 769 |
+
},
|
| 770 |
+
"o1": {
|
| 771 |
+
"config": {
|
| 772 |
+
"model_name": "o1",
|
| 773 |
+
"model_sha": "https://openai.com/o1",
|
| 774 |
+
"model_dtype": "torch.float16"
|
| 775 |
+
},
|
| 776 |
+
"results": {
|
| 777 |
+
"winogrande": {
|
| 778 |
+
"accuracy": 0.9392265193370166,
|
| 779 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T16-46-06-05-00_winogrande_YUtAdEsForRffqe4Sm3wtR.eval"
|
| 780 |
+
},
|
| 781 |
+
"humaneval": {
|
| 782 |
+
"mean": 0.9695121951219512,
|
| 783 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T14-59-12-05-00_humaneval_RRL8GMy9NakTxUHsDVWNng.eval"
|
| 784 |
+
},
|
| 785 |
+
"mmmu_open": {
|
| 786 |
+
"accuracy": 0.6981132075471698,
|
| 787 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T22-48-09-05-00_mmmu-open_oBzxJBYbvnktbbAwhoCrYK.eval"
|
| 788 |
+
},
|
| 789 |
+
"math": {
|
| 790 |
+
"accuracy": 0.959,
|
| 791 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T15-03-22-05-00_math_6BbvHFF8hLMsVYozyNLbyQ.eval"
|
| 792 |
+
},
|
| 793 |
+
"arc_easy": {
|
| 794 |
+
"accuracy": 0.9911616161616161,
|
| 795 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T11-29-26-05-00_arc-easy_DFbir4BdgQDbKd52r7tRKR.eval"
|
| 796 |
+
},
|
| 797 |
+
"arc_challenge": {
|
| 798 |
+
"accuracy": 0.9786689419795221,
|
| 799 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T11-44-42-05-00_arc-challenge_PsWXaBqrgv3EcTZC55gRzJ.eval"
|
| 800 |
+
},
|
| 801 |
+
"gsm8k": {
|
| 802 |
+
"accuracy": 0.9416224412433661,
|
| 803 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T12-56-38-05-00_gsm8k_iD8275qeyNTgX523pn45bF.eval"
|
| 804 |
+
},
|
| 805 |
+
"gpqa_diamond": {
|
| 806 |
+
"accuracy": 0.7550505050505051,
|
| 807 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T11-53-53-05-00_gpqa-diamond_EJV7ULFSQLRoFTEqsv3t6q.eval"
|
| 808 |
+
},
|
| 809 |
+
"mmlu_pro": {
|
| 810 |
+
"accuracy": 0.8447473404255319,
|
| 811 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T14-02-37-05-00_mmlu-pro_EvDzvqaahQwhv6fJovN4BT.eval"
|
| 812 |
+
},
|
| 813 |
+
"mmmu_multiple_choice": {
|
| 814 |
+
"accuracy": 0.8063754427390791,
|
| 815 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T21-04-57-05-00_mmmu-multiple-choice_MctxjookaeTLCL8KpUeazT.eval"
|
| 816 |
+
},
|
| 817 |
+
"drop": {
|
| 818 |
+
"mean": null,
|
| 819 |
+
"log_url": null
|
| 820 |
+
},
|
| 821 |
+
"hellaswag": {
|
| 822 |
+
"accuracy": null,
|
| 823 |
+
"log_url": null
|
| 824 |
+
},
|
| 825 |
+
"ifeval": {
|
| 826 |
+
"final_acc": null,
|
| 827 |
+
"log_url": null
|
| 828 |
+
},
|
| 829 |
+
"mmlu": {
|
| 830 |
+
"accuracy": null,
|
| 831 |
+
"log_url": null
|
| 832 |
+
},
|
| 833 |
+
"gaia": {
|
| 834 |
+
"accuracy": 0.41090909090909084,
|
| 835 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-22T13-42-00-05-00_o1_gaia_merged.eval"
|
| 836 |
+
},
|
| 837 |
+
"gdm_intercode_ctf": {
|
| 838 |
+
"accuracy": 0.8481012658227849,
|
| 839 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-22T20-46-35+00-00_o1_gdm-intercode-ctf_merged.eval"
|
| 840 |
+
},
|
| 841 |
+
"gdm_in_house_ctf": {
|
| 842 |
+
"accuracy": 0.46153846153846156,
|
| 843 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-22T05-52-25+00-00_o1_gdm-in-house-ctf.eval"
|
| 844 |
+
},
|
| 845 |
+
"agentharm": {
|
| 846 |
+
"avg_score": 0.08782061688311688,
|
| 847 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-21T09-05-42-08-00_agentharm_UGDq2yJeLAnPH6p7FgDgD8.eval"
|
| 848 |
+
},
|
| 849 |
+
"agentharm_benign": {
|
| 850 |
+
"avg_score": 0.7235176849665487,
|
| 851 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-21T18-20-15-08-00_agentharm-benign_bkW2Bf5xLyDQdNtfLdjCpJ.eval"
|
| 852 |
+
},
|
| 853 |
+
"swe_bench": {
|
| 854 |
+
"mean": 0.0036,
|
| 855 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-21T17-42-11+00-00_openai-o1_swe.eval"
|
| 856 |
+
}
|
| 857 |
+
}
|
| 858 |
+
},
|
| 859 |
+
"o3-mini": {
|
| 860 |
+
"config": {
|
| 861 |
+
"model_name": "o3-mini",
|
| 862 |
+
"model_sha": "https://openai.com/index/openai-o3-mini",
|
| 863 |
+
"model_dtype": "torch.float16"
|
| 864 |
+
},
|
| 865 |
+
"results": {
|
| 866 |
+
"math": {
|
| 867 |
+
"accuracy": 0.9691320905993185,
|
| 868 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T18-33-30-05-00_math_86Gx8n4BxhpyfaSHmRcCUm.eval"
|
| 869 |
+
},
|
| 870 |
+
"humaneval": {
|
| 871 |
+
"mean": 0.9817073170731707,
|
| 872 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T20-58-48-05-00_humaneval_Dkod7CS9RmbbogYx9aEXtx.eval"
|
| 873 |
+
},
|
| 874 |
+
"mmlu_pro": {
|
| 875 |
+
"accuracy": 0.7924606807023383,
|
| 876 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T19-49-27-05-00_mmlu-pro_jz9woKfdKt8VMzqNFsy7kY.eval"
|
| 877 |
+
},
|
| 878 |
+
"gpqa_diamond": {
|
| 879 |
+
"accuracy": 0.7365319865319865,
|
| 880 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T17-57-54-05-00_gpqa-diamond_2znyMtdc7X4LJufxXeXA8Z.eval"
|
| 881 |
+
},
|
| 882 |
+
"winogrande": {
|
| 883 |
+
"accuracy": 0.8492501973164956,
|
| 884 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T22-50-40-05-00_winogrande_VsTW2uU2Kj66YoNoFfRfUj.eval"
|
| 885 |
+
},
|
| 886 |
+
"gsm8k": {
|
| 887 |
+
"accuracy": 0.9454131918119788,
|
| 888 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T18-23-05-05-00_gsm8k_d523pJzkcvobxamhhobCRb.eval"
|
| 889 |
+
},
|
| 890 |
+
"arc_challenge": {
|
| 891 |
+
"accuracy": 0.9641638225255973,
|
| 892 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T17-53-30-05-00_arc-challenge_AYFHec7wmd4jELF2Rgzfya.eval"
|
| 893 |
+
},
|
| 894 |
+
"arc_easy": {
|
| 895 |
+
"accuracy": 0.9755892255892256,
|
| 896 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T17-45-57-05-00_arc-easy_Nd8NP3K48tvwLVZb8kXDwg.eval"
|
| 897 |
+
},
|
| 898 |
+
"drop": {
|
| 899 |
+
"mean": null,
|
| 900 |
+
"log_url": null
|
| 901 |
+
},
|
| 902 |
+
"hellaswag": {
|
| 903 |
+
"accuracy": null,
|
| 904 |
+
"log_url": null
|
| 905 |
+
},
|
| 906 |
+
"ifeval": {
|
| 907 |
+
"final_acc": null,
|
| 908 |
+
"log_url": null
|
| 909 |
+
},
|
| 910 |
+
"mmlu": {
|
| 911 |
+
"accuracy": null,
|
| 912 |
+
"log_url": null
|
| 913 |
+
},
|
| 914 |
+
"mmmu_multiple_choice": {
|
| 915 |
+
"accuracy": null,
|
| 916 |
+
"log_url": null
|
| 917 |
+
},
|
| 918 |
+
"mmmu_open": {
|
| 919 |
+
"accuracy": null,
|
| 920 |
+
"log_url": null
|
| 921 |
+
},
|
| 922 |
+
"gaia": {
|
| 923 |
+
"accuracy": 0.27030303030303043,
|
| 924 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-02-05T23-21-20+00-00_gaia_hyMq8MzMm6NgAeq3dNqZSU.eval"
|
| 925 |
+
},
|
| 926 |
+
"gdm_intercode_ctf": {
|
| 927 |
+
"accuracy": 0.8278481012658225,
|
| 928 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-05T21-43-18+00-00_gdm-intercode-ctf_gdm29C6DuTEsX9qm9ymmrC.eval"
|
| 929 |
+
},
|
| 930 |
+
"gdm_in_house_ctf": {
|
| 931 |
+
"accuracy": 0.38461538461538464,
|
| 932 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-05T23-59-08+00-00_gdm-in-house-ctf_2zkAX5nkJoxDnVKpJL9VgW.eval"
|
| 933 |
+
},
|
| 934 |
+
"agentharm": {
|
| 935 |
+
"avg_score": 0.1241931080283353,
|
| 936 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-03T18-17-03-08-00_agentharm_DmN6i5HrgXHNARjsuSewjg.eval"
|
| 937 |
+
},
|
| 938 |
+
"agentharm_benign": {
|
| 939 |
+
"avg_score": 0.5429306867375049,
|
| 940 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-03T18-49-08-08-00_agentharm-benign_Gv94YFpAXaaCJqe3Fc6yr3.eval"
|
| 941 |
+
},
|
| 942 |
+
"swe_bench": {
|
| 943 |
+
"mean": 0.0024,
|
| 944 |
+
"log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-03T06-49-09+00-00_openai-o3-mini_swe.eval"
|
| 945 |
+
}
|
| 946 |
+
}
|
| 947 |
+
}
|
| 948 |
+
}
|
data/results.json.bak
ADDED
|
@@ -0,0 +1,760 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"DeepSeek-R1": {
|
| 3 |
+
"config": {
|
| 4 |
+
"model_name": "DeepSeek-R1",
|
| 5 |
+
"model_sha": "https://api-docs.deepseek.com/news/news250120",
|
| 6 |
+
"model_dtype": "torch.float16"
|
| 7 |
+
},
|
| 8 |
+
"results": {
|
| 9 |
+
"mmlu_pro": {
|
| 10 |
+
"accuracy": 0.8382646276595744
|
| 11 |
+
},
|
| 12 |
+
"humaneval": {
|
| 13 |
+
"mean": 0.9567901234567902
|
| 14 |
+
},
|
| 15 |
+
"math": {
|
| 16 |
+
"accuracy": 0.9272
|
| 17 |
+
},
|
| 18 |
+
"gsm8k": {
|
| 19 |
+
"accuracy": 0.954510993176649
|
| 20 |
+
},
|
| 21 |
+
"arc_challenge": {
|
| 22 |
+
"accuracy": 0.9667235494880546
|
| 23 |
+
},
|
| 24 |
+
"winogrande": {
|
| 25 |
+
"accuracy": 0.9179163378058406
|
| 26 |
+
},
|
| 27 |
+
"arc_easy": {
|
| 28 |
+
"accuracy": 0.9873737373737373
|
| 29 |
+
},
|
| 30 |
+
"gpqa_diamond": {
|
| 31 |
+
"accuracy": 0.7045454545454546
|
| 32 |
+
},
|
| 33 |
+
"drop": {
|
| 34 |
+
"mean": null
|
| 35 |
+
},
|
| 36 |
+
"hellaswag": {
|
| 37 |
+
"accuracy": null
|
| 38 |
+
},
|
| 39 |
+
"ifeval": {
|
| 40 |
+
"final_acc": null
|
| 41 |
+
},
|
| 42 |
+
"mmlu": {
|
| 43 |
+
"accuracy": null
|
| 44 |
+
},
|
| 45 |
+
"mmmu_multiple_choice": {
|
| 46 |
+
"accuracy": null
|
| 47 |
+
},
|
| 48 |
+
"mmmu_open": {
|
| 49 |
+
"accuracy": null
|
| 50 |
+
},
|
| 51 |
+
"gaia": {
|
| 52 |
+
"accuracy": null
|
| 53 |
+
},
|
| 54 |
+
"gdm_intercode_ctf": {
|
| 55 |
+
"accuracy": null
|
| 56 |
+
},
|
| 57 |
+
"gdm_in_house_ctf": {
|
| 58 |
+
"accuracy": null
|
| 59 |
+
},
|
| 60 |
+
"agentharm": {
|
| 61 |
+
"avg_score": null
|
| 62 |
+
},
|
| 63 |
+
"agentharm_benign": {
|
| 64 |
+
"avg_score": null
|
| 65 |
+
},
|
| 66 |
+
"swe_bench": {
|
| 67 |
+
"mean": null
|
| 68 |
+
}
|
| 69 |
+
}
|
| 70 |
+
},
|
| 71 |
+
"Meta-Llama-3.1-70B-Instruct": {
|
| 72 |
+
"config": {
|
| 73 |
+
"model_name": "Meta-Llama-3.1-70B-Instruct",
|
| 74 |
+
"model_sha": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
|
| 75 |
+
"model_dtype": "torch.float16"
|
| 76 |
+
},
|
| 77 |
+
"results": {
|
| 78 |
+
"hellaswag": {
|
| 79 |
+
"accuracy": 0.869946225851424
|
| 80 |
+
},
|
| 81 |
+
"drop": {
|
| 82 |
+
"mean": 0.8811263765076035
|
| 83 |
+
},
|
| 84 |
+
"gpqa_diamond": {
|
| 85 |
+
"accuracy": 0.4318181818181818
|
| 86 |
+
},
|
| 87 |
+
"winogrande": {
|
| 88 |
+
"accuracy": 0.8666140489344909
|
| 89 |
+
},
|
| 90 |
+
"gsm8k": {
|
| 91 |
+
"accuracy": 0.9469294920394238
|
| 92 |
+
},
|
| 93 |
+
"math": {
|
| 94 |
+
"accuracy": 0.6004
|
| 95 |
+
},
|
| 96 |
+
"ifeval": {
|
| 97 |
+
"final_acc": 0.8604907201780166
|
| 98 |
+
},
|
| 99 |
+
"arc_challenge": {
|
| 100 |
+
"accuracy": 0.9445392491467577
|
| 101 |
+
},
|
| 102 |
+
"arc_easy": {
|
| 103 |
+
"accuracy": 0.9823232323232324
|
| 104 |
+
},
|
| 105 |
+
"mmlu_pro": {
|
| 106 |
+
"accuracy": 0.6688829787234043
|
| 107 |
+
},
|
| 108 |
+
"humaneval": {
|
| 109 |
+
"mean": 0.7865853658536586
|
| 110 |
+
},
|
| 111 |
+
"mmlu": {
|
| 112 |
+
"accuracy": 0.8033755875231449
|
| 113 |
+
},
|
| 114 |
+
"mmmu_multiple_choice": {
|
| 115 |
+
"accuracy": null
|
| 116 |
+
},
|
| 117 |
+
"mmmu_open": {
|
| 118 |
+
"accuracy": null
|
| 119 |
+
},
|
| 120 |
+
"gaia": {
|
| 121 |
+
"accuracy": null
|
| 122 |
+
},
|
| 123 |
+
"gdm_intercode_ctf": {
|
| 124 |
+
"accuracy": null
|
| 125 |
+
},
|
| 126 |
+
"gdm_in_house_ctf": {
|
| 127 |
+
"accuracy": null
|
| 128 |
+
},
|
| 129 |
+
"agentharm": {
|
| 130 |
+
"avg_score": null
|
| 131 |
+
},
|
| 132 |
+
"agentharm_benign": {
|
| 133 |
+
"avg_score": null
|
| 134 |
+
},
|
| 135 |
+
"swe_bench": {
|
| 136 |
+
"mean": null
|
| 137 |
+
}
|
| 138 |
+
}
|
| 139 |
+
},
|
| 140 |
+
"Mistral-Large-Instruct-2407": {
|
| 141 |
+
"config": {
|
| 142 |
+
"model_name": "Mistral-Large-Instruct-2407",
|
| 143 |
+
"model_sha": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
|
| 144 |
+
"model_dtype": "torch.float16"
|
| 145 |
+
},
|
| 146 |
+
"results": {
|
| 147 |
+
"drop": {
|
| 148 |
+
"mean": 0.7424257996853698
|
| 149 |
+
},
|
| 150 |
+
"ifeval": {
|
| 151 |
+
"final_acc": 0.8285172231900246
|
| 152 |
+
},
|
| 153 |
+
"mmlu": {
|
| 154 |
+
"accuracy": 0.8035892323030908
|
| 155 |
+
},
|
| 156 |
+
"gpqa_diamond": {
|
| 157 |
+
"accuracy": 0.4734848484848485
|
| 158 |
+
},
|
| 159 |
+
"gsm8k": {
|
| 160 |
+
"accuracy": 0.9378316906747536
|
| 161 |
+
},
|
| 162 |
+
"math": {
|
| 163 |
+
"accuracy": 0.6574
|
| 164 |
+
},
|
| 165 |
+
"arc_easy": {
|
| 166 |
+
"accuracy": 0.9852693602693603
|
| 167 |
+
},
|
| 168 |
+
"mmlu_pro": {
|
| 169 |
+
"accuracy": 0.6942320478723404
|
| 170 |
+
},
|
| 171 |
+
"humaneval": {
|
| 172 |
+
"mean": 0.8658536585365854
|
| 173 |
+
},
|
| 174 |
+
"hellaswag": {
|
| 175 |
+
"accuracy": 0.9047998406691894
|
| 176 |
+
},
|
| 177 |
+
"arc_challenge": {
|
| 178 |
+
"accuracy": 0.9436860068259386
|
| 179 |
+
},
|
| 180 |
+
"winogrande": {
|
| 181 |
+
"accuracy": 0.8547750591949487
|
| 182 |
+
},
|
| 183 |
+
"mmmu_multiple_choice": {
|
| 184 |
+
"accuracy": null
|
| 185 |
+
},
|
| 186 |
+
"mmmu_open": {
|
| 187 |
+
"accuracy": null
|
| 188 |
+
},
|
| 189 |
+
"gaia": {
|
| 190 |
+
"accuracy": null
|
| 191 |
+
},
|
| 192 |
+
"gdm_intercode_ctf": {
|
| 193 |
+
"accuracy": null
|
| 194 |
+
},
|
| 195 |
+
"gdm_in_house_ctf": {
|
| 196 |
+
"accuracy": null
|
| 197 |
+
},
|
| 198 |
+
"agentharm": {
|
| 199 |
+
"avg_score": null
|
| 200 |
+
},
|
| 201 |
+
"agentharm_benign": {
|
| 202 |
+
"avg_score": null
|
| 203 |
+
},
|
| 204 |
+
"swe_bench": {
|
| 205 |
+
"mean": null
|
| 206 |
+
}
|
| 207 |
+
}
|
| 208 |
+
},
|
| 209 |
+
"c4ai-command-r-plus": {
|
| 210 |
+
"config": {
|
| 211 |
+
"model_name": "c4ai-command-r-plus",
|
| 212 |
+
"model_sha": "https://huggingface.co/CohereForAI/c4ai-command-r-plus"
|
| 213 |
+
},
|
| 214 |
+
"results": {
|
| 215 |
+
"ifeval": {
|
| 216 |
+
"final_acc": 0.7779591483929307
|
| 217 |
+
},
|
| 218 |
+
"winogrande": {
|
| 219 |
+
"accuracy": 0.7490134175217048
|
| 220 |
+
},
|
| 221 |
+
"arc_challenge": {
|
| 222 |
+
"accuracy": 0.8506825938566553
|
| 223 |
+
},
|
| 224 |
+
"drop": {
|
| 225 |
+
"mean": 0.743557420031463
|
| 226 |
+
},
|
| 227 |
+
"math": {
|
| 228 |
+
"accuracy": 0.2626
|
| 229 |
+
},
|
| 230 |
+
"gpqa_diamond": {
|
| 231 |
+
"accuracy": 0.3194444444444444
|
| 232 |
+
},
|
| 233 |
+
"mmlu_pro": {
|
| 234 |
+
"accuracy": 0.441156914893617
|
| 235 |
+
},
|
| 236 |
+
"humaneval": {
|
| 237 |
+
"mean": 0.6219512195121951
|
| 238 |
+
},
|
| 239 |
+
"gsm8k": {
|
| 240 |
+
"accuracy": 0.7816527672479151
|
| 241 |
+
},
|
| 242 |
+
"hellaswag": {
|
| 243 |
+
"accuracy": 0.7954590718980283
|
| 244 |
+
},
|
| 245 |
+
"mmlu": {
|
| 246 |
+
"accuracy": 0.695128899017234
|
| 247 |
+
},
|
| 248 |
+
"arc_easy": {
|
| 249 |
+
"accuracy": 0.9377104377104377
|
| 250 |
+
},
|
| 251 |
+
"mmmu_multiple_choice": {
|
| 252 |
+
"accuracy": null
|
| 253 |
+
},
|
| 254 |
+
"mmmu_open": {
|
| 255 |
+
"accuracy": null
|
| 256 |
+
},
|
| 257 |
+
"gaia": {
|
| 258 |
+
"accuracy": null
|
| 259 |
+
},
|
| 260 |
+
"gdm_intercode_ctf": {
|
| 261 |
+
"accuracy": null
|
| 262 |
+
},
|
| 263 |
+
"gdm_in_house_ctf": {
|
| 264 |
+
"accuracy": null
|
| 265 |
+
},
|
| 266 |
+
"agentharm": {
|
| 267 |
+
"avg_score": null
|
| 268 |
+
},
|
| 269 |
+
"agentharm_benign": {
|
| 270 |
+
"avg_score": null
|
| 271 |
+
},
|
| 272 |
+
"swe_bench": {
|
| 273 |
+
"mean": null
|
| 274 |
+
}
|
| 275 |
+
}
|
| 276 |
+
},
|
| 277 |
+
"claude-3-5-sonnet-20241022": {
|
| 278 |
+
"config": {
|
| 279 |
+
"model_name": "claude-3-5-sonnet-20241022",
|
| 280 |
+
"model_sha": "https://www.anthropic.com/claude/sonnet",
|
| 281 |
+
"model_dtype": "torch.float16"
|
| 282 |
+
},
|
| 283 |
+
"results": {
|
| 284 |
+
"mmmu_multiple_choice": {
|
| 285 |
+
"accuracy": 0.6481700118063755
|
| 286 |
+
},
|
| 287 |
+
"mmlu_pro": {
|
| 288 |
+
"accuracy": 0.7762632978723404
|
| 289 |
+
},
|
| 290 |
+
"hellaswag": {
|
| 291 |
+
"accuracy": 0.9228241386178052
|
| 292 |
+
},
|
| 293 |
+
"gpqa_diamond": {
|
| 294 |
+
"accuracy": 0.6098484848484849
|
| 295 |
+
},
|
| 296 |
+
"gsm8k": {
|
| 297 |
+
"accuracy": 0.9620924943138741
|
| 298 |
+
},
|
| 299 |
+
"mmmu_open": {
|
| 300 |
+
"accuracy": 0.41509433962264153
|
| 301 |
+
},
|
| 302 |
+
"arc_easy": {
|
| 303 |
+
"accuracy": 0.9915824915824916
|
| 304 |
+
},
|
| 305 |
+
"arc_challenge": {
|
| 306 |
+
"accuracy": 0.9692832764505119
|
| 307 |
+
},
|
| 308 |
+
"mmlu": {
|
| 309 |
+
"accuracy": 0.8665432274604757
|
| 310 |
+
},
|
| 311 |
+
"math": {
|
| 312 |
+
"accuracy": 0.7942
|
| 313 |
+
},
|
| 314 |
+
"ifeval": {
|
| 315 |
+
"final_acc": 0.8958114469607309
|
| 316 |
+
},
|
| 317 |
+
"humaneval": {
|
| 318 |
+
"mean": 0.9451219512195121
|
| 319 |
+
},
|
| 320 |
+
"winogrande": {
|
| 321 |
+
"accuracy": 0.9021310181531176
|
| 322 |
+
},
|
| 323 |
+
"drop": {
|
| 324 |
+
"mean": 0.8977608809648663
|
| 325 |
+
},
|
| 326 |
+
"gaia": {
|
| 327 |
+
"accuracy": 0.3381818181818182
|
| 328 |
+
},
|
| 329 |
+
"gdm_intercode_ctf": {
|
| 330 |
+
"accuracy": 0.8556962025316455
|
| 331 |
+
},
|
| 332 |
+
"gdm_in_house_ctf": {
|
| 333 |
+
"accuracy": 0.6153846153846154
|
| 334 |
+
},
|
| 335 |
+
"agentharm": {
|
| 336 |
+
"avg_score": 0.14767992424242424
|
| 337 |
+
},
|
| 338 |
+
"agentharm_benign": {
|
| 339 |
+
"avg_score": 0.800704570051161
|
| 340 |
+
},
|
| 341 |
+
"swe_bench": {
|
| 342 |
+
"mean": 0.0672
|
| 343 |
+
}
|
| 344 |
+
}
|
| 345 |
+
},
|
| 346 |
+
"gemini-1.5-flash": {
|
| 347 |
+
"config": {
|
| 348 |
+
"model_name": "gemini-1.5-flash",
|
| 349 |
+
"model_sha": "https://deepmind.google/technologies/gemini/flash",
|
| 350 |
+
"model_dtype": "torch.float16"
|
| 351 |
+
},
|
| 352 |
+
"results": {
|
| 353 |
+
"gpqa_diamond": {
|
| 354 |
+
"accuracy": 0.40404040404040403
|
| 355 |
+
},
|
| 356 |
+
"arc_challenge": {
|
| 357 |
+
"accuracy": 0.9308873720136519
|
| 358 |
+
},
|
| 359 |
+
"math": {
|
| 360 |
+
"accuracy": 0.452
|
| 361 |
+
},
|
| 362 |
+
"mmmu_open": {
|
| 363 |
+
"accuracy": 0.16981132075471697
|
| 364 |
+
},
|
| 365 |
+
"drop": {
|
| 366 |
+
"mean": 0.751044572627163
|
| 367 |
+
},
|
| 368 |
+
"mmlu_pro": {
|
| 369 |
+
"accuracy": 0.5993184840425532
|
| 370 |
+
},
|
| 371 |
+
"ifeval": {
|
| 372 |
+
"final_acc": 0.7681296737102001
|
| 373 |
+
},
|
| 374 |
+
"hellaswag": {
|
| 375 |
+
"accuracy": 0.8557060346544513
|
| 376 |
+
},
|
| 377 |
+
"winogrande": {
|
| 378 |
+
"accuracy": 0.7884767166535123
|
| 379 |
+
},
|
| 380 |
+
"humaneval": {
|
| 381 |
+
"mean": 0.7439024390243902
|
| 382 |
+
},
|
| 383 |
+
"arc_easy": {
|
| 384 |
+
"accuracy": 0.984006734006734
|
| 385 |
+
},
|
| 386 |
+
"gsm8k": {
|
| 387 |
+
"accuracy": 0.8582259287338894
|
| 388 |
+
},
|
| 389 |
+
"mmlu": {
|
| 390 |
+
"accuracy": 0.7714713003845606
|
| 391 |
+
},
|
| 392 |
+
"mmmu_multiple_choice": {
|
| 393 |
+
"accuracy": 0.5702479338842975
|
| 394 |
+
},
|
| 395 |
+
"gaia": {
|
| 396 |
+
"accuracy": null
|
| 397 |
+
},
|
| 398 |
+
"gdm_intercode_ctf": {
|
| 399 |
+
"accuracy": null
|
| 400 |
+
},
|
| 401 |
+
"gdm_in_house_ctf": {
|
| 402 |
+
"accuracy": null
|
| 403 |
+
},
|
| 404 |
+
"agentharm": {
|
| 405 |
+
"avg_score": null
|
| 406 |
+
},
|
| 407 |
+
"agentharm_benign": {
|
| 408 |
+
"avg_score": null
|
| 409 |
+
},
|
| 410 |
+
"swe_bench": {
|
| 411 |
+
"mean": null
|
| 412 |
+
}
|
| 413 |
+
}
|
| 414 |
+
},
|
| 415 |
+
"gemini-1.5-pro": {
|
| 416 |
+
"config": {
|
| 417 |
+
"model_name": "gemini-1.5-pro",
|
| 418 |
+
"model_sha": "https://deepmind.google/technologies/gemini/pro",
|
| 419 |
+
"model_dtype": "torch.float16"
|
| 420 |
+
},
|
| 421 |
+
"results": {
|
| 422 |
+
"mmlu": {
|
| 423 |
+
"accuracy": 0.8467454778521578
|
| 424 |
+
},
|
| 425 |
+
"humaneval": {
|
| 426 |
+
"mean": 0.8719512195121951
|
| 427 |
+
},
|
| 428 |
+
"mmmu_multiple_choice": {
|
| 429 |
+
"accuracy": 0.6304604486422668
|
| 430 |
+
},
|
| 431 |
+
"mmlu_pro": {
|
| 432 |
+
"accuracy": 0.7563996010638298
|
| 433 |
+
},
|
| 434 |
+
"math": {
|
| 435 |
+
"accuracy": 0.852
|
| 436 |
+
},
|
| 437 |
+
"arc_easy": {
|
| 438 |
+
"accuracy": 0.9877946127946128
|
| 439 |
+
},
|
| 440 |
+
"mmmu_open": {
|
| 441 |
+
"accuracy": 0.3584905660377358
|
| 442 |
+
},
|
| 443 |
+
"gsm8k": {
|
| 444 |
+
"accuracy": 0.9613343442001516
|
| 445 |
+
},
|
| 446 |
+
"gpqa_diamond": {
|
| 447 |
+
"accuracy": 0.5782828282828283
|
| 448 |
+
},
|
| 449 |
+
"ifeval": {
|
| 450 |
+
"final_acc": 0.8982344623377084
|
| 451 |
+
},
|
| 452 |
+
"winogrande": {
|
| 453 |
+
"accuracy": 0.8768745067087609
|
| 454 |
+
},
|
| 455 |
+
"arc_challenge": {
|
| 456 |
+
"accuracy": 0.9633105802047781
|
| 457 |
+
},
|
| 458 |
+
"drop": {
|
| 459 |
+
"mean": 0.8800912427897221
|
| 460 |
+
},
|
| 461 |
+
"hellaswag": {
|
| 462 |
+
"accuracy": 0.9123680541724756
|
| 463 |
+
},
|
| 464 |
+
"gaia": {
|
| 465 |
+
"accuracy": 0.13818181818181818
|
| 466 |
+
},
|
| 467 |
+
"gdm_intercode_ctf": {
|
| 468 |
+
"accuracy": 0.5291139240506328
|
| 469 |
+
},
|
| 470 |
+
"gdm_in_house_ctf": {
|
| 471 |
+
"accuracy": 0.23076923076923078
|
| 472 |
+
},
|
| 473 |
+
"agentharm": {
|
| 474 |
+
"avg_score": 0.2898649645808737
|
| 475 |
+
},
|
| 476 |
+
"agentharm_benign": {
|
| 477 |
+
"avg_score": 0.5961489079102715
|
| 478 |
+
},
|
| 479 |
+
"swe_bench": {
|
| 480 |
+
"mean": 0.004
|
| 481 |
+
}
|
| 482 |
+
}
|
| 483 |
+
},
|
| 484 |
+
"gpt-4o": {
|
| 485 |
+
"config": {
|
| 486 |
+
"model_name": "gpt-4o",
|
| 487 |
+
"model_sha": "https://openai.com/index/hello-gpt-4o",
|
| 488 |
+
"model_dtype": "torch.float16"
|
| 489 |
+
},
|
| 490 |
+
"results": {
|
| 491 |
+
"gpqa_diamond": {
|
| 492 |
+
"accuracy": 0.51010101010101
|
| 493 |
+
},
|
| 494 |
+
"arc_challenge": {
|
| 495 |
+
"accuracy": 0.9633105802047781
|
| 496 |
+
},
|
| 497 |
+
"gsm8k": {
|
| 498 |
+
"accuracy": 0.9446550416982562
|
| 499 |
+
},
|
| 500 |
+
"mmlu": {
|
| 501 |
+
"accuracy": 0.8435408061529697
|
| 502 |
+
},
|
| 503 |
+
"ifeval": {
|
| 504 |
+
"final_acc": 0.8780386042367585
|
| 505 |
+
},
|
| 506 |
+
"mmlu_pro": {
|
| 507 |
+
"accuracy": 0.7450964095744681
|
| 508 |
+
},
|
| 509 |
+
"mmmu_open": {
|
| 510 |
+
"accuracy": 0.3584905660377358
|
| 511 |
+
},
|
| 512 |
+
"winogrande": {
|
| 513 |
+
"accuracy": 0.9013417521704814
|
| 514 |
+
},
|
| 515 |
+
"drop": {
|
| 516 |
+
"mean": 0.7511693759832198
|
| 517 |
+
},
|
| 518 |
+
"arc_easy": {
|
| 519 |
+
"accuracy": 0.9915824915824916
|
| 520 |
+
},
|
| 521 |
+
"mmmu_multiple_choice": {
|
| 522 |
+
"accuracy": 0.5903187721369539
|
| 523 |
+
},
|
| 524 |
+
"humaneval": {
|
| 525 |
+
"mean": 0.9085365853658537
|
| 526 |
+
},
|
| 527 |
+
"math": {
|
| 528 |
+
"accuracy": 0.7054
|
| 529 |
+
},
|
| 530 |
+
"hellaswag": {
|
| 531 |
+
"accuracy": 0.924317864967138
|
| 532 |
+
},
|
| 533 |
+
"gaia": {
|
| 534 |
+
"accuracy": 0.16606060606060608
|
| 535 |
+
},
|
| 536 |
+
"gdm_intercode_ctf": {
|
| 537 |
+
"accuracy": 0.6379746835443038
|
| 538 |
+
},
|
| 539 |
+
"gdm_in_house_ctf": {
|
| 540 |
+
"accuracy": 0.23076923076923078
|
| 541 |
+
},
|
| 542 |
+
"agentharm": {
|
| 543 |
+
"avg_score": 0.49953844451003543
|
| 544 |
+
},
|
| 545 |
+
"agentharm_benign": {
|
| 546 |
+
"avg_score": 0.8249433048012594
|
| 547 |
+
},
|
| 548 |
+
"swe_bench": {
|
| 549 |
+
"mean": 0.012
|
| 550 |
+
}
|
| 551 |
+
}
|
| 552 |
+
},
|
| 553 |
+
"gpt-4o-mini": {
|
| 554 |
+
"config": {
|
| 555 |
+
"model_name": "gpt-4o-mini",
|
| 556 |
+
"model_sha": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
|
| 557 |
+
"model_dtype": "torch.float16"
|
| 558 |
+
},
|
| 559 |
+
"results": {
|
| 560 |
+
"drop": {
|
| 561 |
+
"mean": 0.8065915049816466
|
| 562 |
+
},
|
| 563 |
+
"humaneval": {
|
| 564 |
+
"mean": 0.8597560975609756
|
| 565 |
+
},
|
| 566 |
+
"gpqa_diamond": {
|
| 567 |
+
"accuracy": 0.3838383838383838
|
| 568 |
+
},
|
| 569 |
+
"mmmu_open": {
|
| 570 |
+
"accuracy": 0.18867924528301888
|
| 571 |
+
},
|
| 572 |
+
"arc_challenge": {
|
| 573 |
+
"accuracy": 0.9249146757679181
|
| 574 |
+
},
|
| 575 |
+
"mmlu": {
|
| 576 |
+
"accuracy": 0.7698333570716422
|
| 577 |
+
},
|
| 578 |
+
"hellaswag": {
|
| 579 |
+
"accuracy": 0.8750248954391555
|
| 580 |
+
},
|
| 581 |
+
"ifeval": {
|
| 582 |
+
"final_acc": 0.8419061423689144
|
| 583 |
+
},
|
| 584 |
+
"mmmu_multiple_choice": {
|
| 585 |
+
"accuracy": 0.5395513577331759
|
| 586 |
+
},
|
| 587 |
+
"arc_easy": {
|
| 588 |
+
"accuracy": 0.9793771043771043
|
| 589 |
+
},
|
| 590 |
+
"winogrande": {
|
| 591 |
+
"accuracy": 0.7529597474348856
|
| 592 |
+
},
|
| 593 |
+
"mmlu_pro": {
|
| 594 |
+
"accuracy": 0.6396276595744681
|
| 595 |
+
},
|
| 596 |
+
"math": {
|
| 597 |
+
"accuracy": 0.633
|
| 598 |
+
},
|
| 599 |
+
"gsm8k": {
|
| 600 |
+
"accuracy": 0.9181197877179682
|
| 601 |
+
},
|
| 602 |
+
"gaia": {
|
| 603 |
+
"accuracy": null
|
| 604 |
+
},
|
| 605 |
+
"gdm_intercode_ctf": {
|
| 606 |
+
"accuracy": null
|
| 607 |
+
},
|
| 608 |
+
"gdm_in_house_ctf": {
|
| 609 |
+
"accuracy": null
|
| 610 |
+
},
|
| 611 |
+
"agentharm": {
|
| 612 |
+
"avg_score": null
|
| 613 |
+
},
|
| 614 |
+
"agentharm_benign": {
|
| 615 |
+
"avg_score": null
|
| 616 |
+
},
|
| 617 |
+
"swe_bench": {
|
| 618 |
+
"mean": null
|
| 619 |
+
}
|
| 620 |
+
}
|
| 621 |
+
},
|
| 622 |
+
"o1": {
|
| 623 |
+
"config": {
|
| 624 |
+
"model_name": "o1",
|
| 625 |
+
"model_sha": "https://openai.com/o1",
|
| 626 |
+
"model_dtype": "torch.float16"
|
| 627 |
+
},
|
| 628 |
+
"results": {
|
| 629 |
+
"winogrande": {
|
| 630 |
+
"accuracy": 0.9392265193370166
|
| 631 |
+
},
|
| 632 |
+
"humaneval": {
|
| 633 |
+
"mean": 0.9695121951219512
|
| 634 |
+
},
|
| 635 |
+
"mmmu_open": {
|
| 636 |
+
"accuracy": 0.6981132075471698
|
| 637 |
+
},
|
| 638 |
+
"math": {
|
| 639 |
+
"accuracy": 0.959
|
| 640 |
+
},
|
| 641 |
+
"arc_easy": {
|
| 642 |
+
"accuracy": 0.9911616161616161
|
| 643 |
+
},
|
| 644 |
+
"arc_challenge": {
|
| 645 |
+
"accuracy": 0.9786689419795221
|
| 646 |
+
},
|
| 647 |
+
"gsm8k": {
|
| 648 |
+
"accuracy": 0.9416224412433661
|
| 649 |
+
},
|
| 650 |
+
"gpqa_diamond": {
|
| 651 |
+
"accuracy": 0.7550505050505051
|
| 652 |
+
},
|
| 653 |
+
"mmlu_pro": {
|
| 654 |
+
"accuracy": 0.8447473404255319
|
| 655 |
+
},
|
| 656 |
+
"mmmu_multiple_choice": {
|
| 657 |
+
"accuracy": 0.8063754427390791
|
| 658 |
+
},
|
| 659 |
+
"drop": {
|
| 660 |
+
"mean": null
|
| 661 |
+
},
|
| 662 |
+
"hellaswag": {
|
| 663 |
+
"accuracy": null
|
| 664 |
+
},
|
| 665 |
+
"ifeval": {
|
| 666 |
+
"final_acc": null
|
| 667 |
+
},
|
| 668 |
+
"mmlu": {
|
| 669 |
+
"accuracy": null
|
| 670 |
+
},
|
| 671 |
+
"gaia": {
|
| 672 |
+
"accuracy": 0.41090909090909084
|
| 673 |
+
},
|
| 674 |
+
"gdm_intercode_ctf": {
|
| 675 |
+
"accuracy": 0.8481012658227849
|
| 676 |
+
},
|
| 677 |
+
"gdm_in_house_ctf": {
|
| 678 |
+
"accuracy": 0.46153846153846156
|
| 679 |
+
},
|
| 680 |
+
"agentharm": {
|
| 681 |
+
"avg_score": 0.08782061688311688
|
| 682 |
+
},
|
| 683 |
+
"agentharm_benign": {
|
| 684 |
+
"avg_score": 0.7235176849665487
|
| 685 |
+
},
|
| 686 |
+
"swe_bench": {
|
| 687 |
+
"mean": 0.0036
|
| 688 |
+
}
|
| 689 |
+
}
|
| 690 |
+
},
|
| 691 |
+
"o3-mini": {
|
| 692 |
+
"config": {
|
| 693 |
+
"model_name": "o3-mini",
|
| 694 |
+
"model_sha": "https://openai.com/index/openai-o3-mini",
|
| 695 |
+
"model_dtype": "torch.float16"
|
| 696 |
+
},
|
| 697 |
+
"results": {
|
| 698 |
+
"math": {
|
| 699 |
+
"accuracy": 0.9691320905993185
|
| 700 |
+
},
|
| 701 |
+
"humaneval": {
|
| 702 |
+
"mean": 0.9817073170731707
|
| 703 |
+
},
|
| 704 |
+
"mmlu_pro": {
|
| 705 |
+
"accuracy": 0.7924606807023383
|
| 706 |
+
},
|
| 707 |
+
"gpqa_diamond": {
|
| 708 |
+
"accuracy": 0.7365319865319865
|
| 709 |
+
},
|
| 710 |
+
"winogrande": {
|
| 711 |
+
"accuracy": 0.8492501973164956
|
| 712 |
+
},
|
| 713 |
+
"gsm8k": {
|
| 714 |
+
"accuracy": 0.9454131918119788
|
| 715 |
+
},
|
| 716 |
+
"arc_challenge": {
|
| 717 |
+
"accuracy": 0.9641638225255973
|
| 718 |
+
},
|
| 719 |
+
"arc_easy": {
|
| 720 |
+
"accuracy": 0.9755892255892256
|
| 721 |
+
},
|
| 722 |
+
"drop": {
|
| 723 |
+
"mean": null
|
| 724 |
+
},
|
| 725 |
+
"hellaswag": {
|
| 726 |
+
"accuracy": null
|
| 727 |
+
},
|
| 728 |
+
"ifeval": {
|
| 729 |
+
"final_acc": null
|
| 730 |
+
},
|
| 731 |
+
"mmlu": {
|
| 732 |
+
"accuracy": null
|
| 733 |
+
},
|
| 734 |
+
"mmmu_multiple_choice": {
|
| 735 |
+
"accuracy": null
|
| 736 |
+
},
|
| 737 |
+
"mmmu_open": {
|
| 738 |
+
"accuracy": null
|
| 739 |
+
},
|
| 740 |
+
"gaia": {
|
| 741 |
+
"accuracy": 0.27030303030303043
|
| 742 |
+
},
|
| 743 |
+
"gdm_intercode_ctf": {
|
| 744 |
+
"accuracy": 0.8278481012658225
|
| 745 |
+
},
|
| 746 |
+
"gdm_in_house_ctf": {
|
| 747 |
+
"accuracy": 0.38461538461538464
|
| 748 |
+
},
|
| 749 |
+
"agentharm": {
|
| 750 |
+
"avg_score": 0.1241931080283353
|
| 751 |
+
},
|
| 752 |
+
"agentharm_benign": {
|
| 753 |
+
"avg_score": 0.5429306867375049
|
| 754 |
+
},
|
| 755 |
+
"swe_bench": {
|
| 756 |
+
"mean": 0.0024
|
| 757 |
+
}
|
| 758 |
+
}
|
| 759 |
+
}
|
| 760 |
+
}
|
data/tasks.json
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"arc_easy": {
|
| 3 |
+
"benchmark": "arc_easy",
|
| 4 |
+
"metric": "accuracy",
|
| 5 |
+
"display_name": "ARC-Easy",
|
| 6 |
+
"type": "base",
|
| 7 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
|
| 8 |
+
},
|
| 9 |
+
"arc_challenge": {
|
| 10 |
+
"benchmark": "arc_challenge",
|
| 11 |
+
"metric": "accuracy",
|
| 12 |
+
"display_name": "ARC-Challenge",
|
| 13 |
+
"type": "base",
|
| 14 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
|
| 15 |
+
},
|
| 16 |
+
"drop": {
|
| 17 |
+
"benchmark": "drop",
|
| 18 |
+
"metric": "mean",
|
| 19 |
+
"display_name": "DROP",
|
| 20 |
+
"type": "base",
|
| 21 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop"
|
| 22 |
+
},
|
| 23 |
+
"winogrande": {
|
| 24 |
+
"benchmark": "winogrande",
|
| 25 |
+
"metric": "accuracy",
|
| 26 |
+
"display_name": "WinoGrande",
|
| 27 |
+
"type": "base",
|
| 28 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande"
|
| 29 |
+
},
|
| 30 |
+
"gsm8k": {
|
| 31 |
+
"benchmark": "gsm8k",
|
| 32 |
+
"metric": "accuracy",
|
| 33 |
+
"display_name": "GSM8K",
|
| 34 |
+
"type": "base",
|
| 35 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k"
|
| 36 |
+
},
|
| 37 |
+
"hellaswag": {
|
| 38 |
+
"benchmark": "hellaswag",
|
| 39 |
+
"metric": "accuracy",
|
| 40 |
+
"display_name": "HellaSwag",
|
| 41 |
+
"type": "base",
|
| 42 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag"
|
| 43 |
+
},
|
| 44 |
+
"humaneval": {
|
| 45 |
+
"benchmark": "humaneval",
|
| 46 |
+
"metric": "mean",
|
| 47 |
+
"display_name": "HumanEval",
|
| 48 |
+
"type": "base",
|
| 49 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval"
|
| 50 |
+
},
|
| 51 |
+
"ifeval": {
|
| 52 |
+
"benchmark": "ifeval",
|
| 53 |
+
"metric": "final_acc",
|
| 54 |
+
"display_name": "IFEval",
|
| 55 |
+
"type": "base",
|
| 56 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval"
|
| 57 |
+
},
|
| 58 |
+
"math": {
|
| 59 |
+
"benchmark": "math",
|
| 60 |
+
"metric": "accuracy",
|
| 61 |
+
"display_name": "MATH",
|
| 62 |
+
"type": "base",
|
| 63 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics"
|
| 64 |
+
},
|
| 65 |
+
"mmlu": {
|
| 66 |
+
"benchmark": "mmlu",
|
| 67 |
+
"metric": "accuracy",
|
| 68 |
+
"display_name": "MMLU",
|
| 69 |
+
"type": "base",
|
| 70 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu"
|
| 71 |
+
},
|
| 72 |
+
"mmlu_pro": {
|
| 73 |
+
"benchmark": "mmlu_pro",
|
| 74 |
+
"metric": "accuracy",
|
| 75 |
+
"display_name": "MMLU-Pro",
|
| 76 |
+
"type": "base",
|
| 77 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro"
|
| 78 |
+
},
|
| 79 |
+
"gpqa_diamond": {
|
| 80 |
+
"benchmark": "gpqa_diamond",
|
| 81 |
+
"metric": "accuracy",
|
| 82 |
+
"display_name": "GPQA-Diamond",
|
| 83 |
+
"type": "base",
|
| 84 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa"
|
| 85 |
+
},
|
| 86 |
+
"mmmu_multiple_choice": {
|
| 87 |
+
"benchmark": "mmmu_multiple_choice",
|
| 88 |
+
"metric": "accuracy",
|
| 89 |
+
"display_name": "MMMU-Multiple-Choice",
|
| 90 |
+
"type": "base",
|
| 91 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
|
| 92 |
+
},
|
| 93 |
+
"mmmu_open": {
|
| 94 |
+
"benchmark": "mmmu_open",
|
| 95 |
+
"metric": "accuracy",
|
| 96 |
+
"display_name": "MMMU-Open-Ended",
|
| 97 |
+
"type": "base",
|
| 98 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
|
| 99 |
+
},
|
| 100 |
+
"gaia": {
|
| 101 |
+
"benchmark": "gaia",
|
| 102 |
+
"metric": "accuracy",
|
| 103 |
+
"display_name": "GAIA",
|
| 104 |
+
"type": "agentic",
|
| 105 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
|
| 106 |
+
},
|
| 107 |
+
"gdm_intercode_ctf": {
|
| 108 |
+
"benchmark": "gdm_intercode_ctf",
|
| 109 |
+
"metric": "accuracy",
|
| 110 |
+
"display_name": "InterCode-CTF",
|
| 111 |
+
"type": "agentic",
|
| 112 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf"
|
| 113 |
+
},
|
| 114 |
+
"gdm_in_house_ctf": {
|
| 115 |
+
"benchmark": "gdm_in_house_ctf",
|
| 116 |
+
"metric": "accuracy",
|
| 117 |
+
"display_name": "In-House-CTF",
|
| 118 |
+
"type": "agentic",
|
| 119 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf"
|
| 120 |
+
},
|
| 121 |
+
"agentharm": {
|
| 122 |
+
"benchmark": "agentharm",
|
| 123 |
+
"metric": "avg_score",
|
| 124 |
+
"display_name": "AgentHarm",
|
| 125 |
+
"type": "agentic",
|
| 126 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
|
| 127 |
+
},
|
| 128 |
+
"agentharm_benign": {
|
| 129 |
+
"benchmark": "agentharm_benign",
|
| 130 |
+
"metric": "avg_score",
|
| 131 |
+
"display_name": "AgentHarm-Benign",
|
| 132 |
+
"type": "agentic",
|
| 133 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
|
| 134 |
+
},
|
| 135 |
+
"swe_bench": {
|
| 136 |
+
"benchmark": "swe_bench",
|
| 137 |
+
"metric": "mean",
|
| 138 |
+
"display_name": "SWE-Bench",
|
| 139 |
+
"type": "agentic",
|
| 140 |
+
"source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench"
|
| 141 |
+
}
|
| 142 |
+
}
|
src/about.py
CHANGED
|
@@ -33,18 +33,13 @@ class Tasks(Enum):
|
|
| 33 |
|
| 34 |
# agentic
|
| 35 |
task14 = Task("gaia", "accuracy", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
|
| 36 |
-
task15 = Task("gdm_intercode_ctf", "accuracy", "
|
| 37 |
-
task16 = Task("gdm_in_house_ctf", "accuracy", "
|
| 38 |
task17 = Task("agentharm", "avg_score", "AgentHarm", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm")
|
| 39 |
task18 = Task("agentharm_benign", "avg_score", "AgentHarm-Benign", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm")
|
| 40 |
task19 = Task("swe_bench", "mean", "SWE-Bench", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench")
|
| 41 |
|
| 42 |
|
| 43 |
-
NUM_FEWSHOT = 0 # Change with your few shot
|
| 44 |
-
# ---------------------------------------------------
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
# Your leaderboard name
|
| 49 |
TITLE = """<h1 align="center" id="space-title">State of Evaluation Leaderboard</h1>"""
|
| 50 |
|
|
|
|
| 33 |
|
| 34 |
# agentic
|
| 35 |
task14 = Task("gaia", "accuracy", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
|
| 36 |
+
task15 = Task("gdm_intercode_ctf", "accuracy", "InterCode-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf")
|
| 37 |
+
task16 = Task("gdm_in_house_ctf", "accuracy", "In-House-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf")
|
| 38 |
task17 = Task("agentharm", "avg_score", "AgentHarm", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm")
|
| 39 |
task18 = Task("agentharm_benign", "avg_score", "AgentHarm-Benign", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm")
|
| 40 |
task19 = Task("swe_bench", "mean", "SWE-Bench", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench")
|
| 41 |
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
# Your leaderboard name
|
| 44 |
TITLE = """<h1 align="center" id="space-title">State of Evaluation Leaderboard</h1>"""
|
| 45 |
|
src/display/formatting.py
CHANGED
|
@@ -5,6 +5,8 @@ def model_hyperlink(link, model_name):
|
|
| 5 |
def make_clickable_model(model_name, model_sha):
|
| 6 |
return model_hyperlink(model_sha, model_name)
|
| 7 |
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def styled_error(error):
|
| 10 |
return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
|
|
|
|
| 5 |
def make_clickable_model(model_name, model_sha):
|
| 6 |
return model_hyperlink(model_sha, model_name)
|
| 7 |
|
| 8 |
+
def make_clickable_field(name, url):
|
| 9 |
+
return model_hyperlink(url, name)
|
| 10 |
|
| 11 |
def styled_error(error):
|
| 12 |
return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
|
src/submission/submit.py
CHANGED
|
@@ -1,119 +1,119 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import os
|
| 3 |
-
from datetime import datetime, timezone
|
| 4 |
-
|
| 5 |
-
from src.display.formatting import styled_error, styled_message, styled_warning
|
| 6 |
-
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
|
| 7 |
-
from src.submission.check_validity import (
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
)
|
| 13 |
-
|
| 14 |
-
REQUESTED_MODELS = None
|
| 15 |
-
USERS_TO_SUBMISSION_DATES = None
|
| 16 |
-
|
| 17 |
-
def add_new_eval(
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
):
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
|
|
|
| 1 |
+
# import json
|
| 2 |
+
# import os
|
| 3 |
+
# from datetime import datetime, timezone
|
| 4 |
+
|
| 5 |
+
# from src.display.formatting import styled_error, styled_message, styled_warning
|
| 6 |
+
# from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
|
| 7 |
+
# from src.submission.check_validity import (
|
| 8 |
+
# already_submitted_models,
|
| 9 |
+
# check_model_card,
|
| 10 |
+
# get_model_size,
|
| 11 |
+
# is_model_on_hub,
|
| 12 |
+
# )
|
| 13 |
+
|
| 14 |
+
# REQUESTED_MODELS = None
|
| 15 |
+
# USERS_TO_SUBMISSION_DATES = None
|
| 16 |
+
|
| 17 |
+
# def add_new_eval(
|
| 18 |
+
# model: str,
|
| 19 |
+
# base_model: str,
|
| 20 |
+
# revision: str,
|
| 21 |
+
# precision: str,
|
| 22 |
+
# weight_type: str,
|
| 23 |
+
# model_type: str,
|
| 24 |
+
# ):
|
| 25 |
+
# global REQUESTED_MODELS
|
| 26 |
+
# global USERS_TO_SUBMISSION_DATES
|
| 27 |
+
# if not REQUESTED_MODELS:
|
| 28 |
+
# REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
|
| 29 |
+
|
| 30 |
+
# user_name = ""
|
| 31 |
+
# model_path = model
|
| 32 |
+
# if "/" in model:
|
| 33 |
+
# user_name = model.split("/")[0]
|
| 34 |
+
# model_path = model.split("/")[1]
|
| 35 |
+
|
| 36 |
+
# precision = precision.split(" ")[0]
|
| 37 |
+
# current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
| 38 |
+
|
| 39 |
+
# if model_type is None or model_type == "":
|
| 40 |
+
# return styled_error("Please select a model type.")
|
| 41 |
+
|
| 42 |
+
# # Does the model actually exist?
|
| 43 |
+
# if revision == "":
|
| 44 |
+
# revision = "main"
|
| 45 |
+
|
| 46 |
+
# # Is the model on the hub?
|
| 47 |
+
# if weight_type in ["Delta", "Adapter"]:
|
| 48 |
+
# base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
|
| 49 |
+
# if not base_model_on_hub:
|
| 50 |
+
# return styled_error(f'Base model "{base_model}" {error}')
|
| 51 |
+
|
| 52 |
+
# if not weight_type == "Adapter":
|
| 53 |
+
# model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
|
| 54 |
+
# if not model_on_hub:
|
| 55 |
+
# return styled_error(f'Model "{model}" {error}')
|
| 56 |
+
|
| 57 |
+
# # Is the model info correctly filled?
|
| 58 |
+
# try:
|
| 59 |
+
# model_info = API.model_info(repo_id=model, revision=revision)
|
| 60 |
+
# except Exception:
|
| 61 |
+
# return styled_error("Could not get your model information. Please fill it up properly.")
|
| 62 |
+
|
| 63 |
+
# model_size = get_model_size(model_info=model_info, precision=precision)
|
| 64 |
+
|
| 65 |
+
# # Were the model card and license filled?
|
| 66 |
+
# try:
|
| 67 |
+
# license = model_info.cardData["license"]
|
| 68 |
+
# except Exception:
|
| 69 |
+
# return styled_error("Please select a license for your model")
|
| 70 |
+
|
| 71 |
+
# modelcard_OK, error_msg = check_model_card(model)
|
| 72 |
+
# if not modelcard_OK:
|
| 73 |
+
# return styled_error(error_msg)
|
| 74 |
+
|
| 75 |
+
# # Seems good, creating the eval
|
| 76 |
+
# print("Adding new eval")
|
| 77 |
+
|
| 78 |
+
# eval_entry = {
|
| 79 |
+
# "model": model,
|
| 80 |
+
# "base_model": base_model,
|
| 81 |
+
# "revision": revision,
|
| 82 |
+
# "precision": precision,
|
| 83 |
+
# "weight_type": weight_type,
|
| 84 |
+
# "status": "PENDING",
|
| 85 |
+
# "submitted_time": current_time,
|
| 86 |
+
# "model_type": model_type,
|
| 87 |
+
# "likes": model_info.likes,
|
| 88 |
+
# "params": model_size,
|
| 89 |
+
# "license": license,
|
| 90 |
+
# "private": False,
|
| 91 |
+
# }
|
| 92 |
+
|
| 93 |
+
# # Check for duplicate submission
|
| 94 |
+
# if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
|
| 95 |
+
# return styled_warning("This model has been already submitted.")
|
| 96 |
+
|
| 97 |
+
# print("Creating eval file")
|
| 98 |
+
# OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
| 99 |
+
# os.makedirs(OUT_DIR, exist_ok=True)
|
| 100 |
+
# out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
|
| 101 |
+
|
| 102 |
+
# with open(out_path, "w") as f:
|
| 103 |
+
# f.write(json.dumps(eval_entry))
|
| 104 |
+
|
| 105 |
+
# print("Uploading eval file")
|
| 106 |
+
# API.upload_file(
|
| 107 |
+
# path_or_fileobj=out_path,
|
| 108 |
+
# path_in_repo=out_path.split("eval-queue/")[1],
|
| 109 |
+
# repo_id=QUEUE_REPO,
|
| 110 |
+
# repo_type="dataset",
|
| 111 |
+
# commit_message=f"Add {model} to eval queue",
|
| 112 |
+
# )
|
| 113 |
+
|
| 114 |
+
# # Remove the local file
|
| 115 |
+
# os.remove(out_path)
|
| 116 |
+
|
| 117 |
+
# return styled_message(
|
| 118 |
+
# "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
|
| 119 |
+
# )
|