# eval-leaderboard / refactor_eval_results.py
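"""Combine Inspect eval logs for base and agentic benchmarks into the JSON
files the leaderboard expects: one results file (eval-results) and one dummy
request file (eval-queue) per model.

Run directly: python refactor_eval_results.py
"""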
import json
import os
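
# Headline metric name for each task; used to index the "metrics" dict in
# Inspect's eval log scores.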
METRIC_NAME = {
# single-turn
"arc_easy": "accuracy",
"arc_challenge": "accuracy",
"gpqa_diamond": "accuracy",
"drop": "mean",
"winogrande": "accuracy",
"gsm8k": "accuracy",
"hellaswag": "accuracy",
"humaneval": "mean",
"ifeval": "final_acc",
"math": "accuracy",
"mmlu": "accuracy",
"mmlu_pro": "accuracy",
"mmmu_multiple_choice": "accuracy",
"mmmu_open": "accuracy",
# agentic
"gaia": "accuracy",
"gdm_intercode_ctf": "accuracy",
"gdm_in_house_ctf": "accuracy",
"agentharm": "avg_score",
"agentharm_benign": "avg_score",
"swe_bench": "mean",
}
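
# Landing page / model card linked as the leaderboard's model_sha field.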
MODEL_SHA_MAP = {
# open source models
"c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
"Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
"Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
"Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
# closed source models
"claude-3-5-sonnet-20241022": "https://www.anthropic.com/claude/sonnet",
"gemini-1.5-flash": "https://deepmind.google/technologies/gemini/flash", # TODO: points to 2.0, can't find page for 1.5
"gemini-1.5-pro": "https://deepmind.google/technologies/gemini/pro",
"gpt-4o": "https://openai.com/index/hello-gpt-4o",
"gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
"o1": "https://openai.com/o1",
}
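
# Versioned display names shown on the leaderboard.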
MODEL_VERSION_MAP = {
# open source models
"c4ai-command-r-plus": "c4ai-command-r-plus",
"Meta-Llama-3.1-70B-Instruct": "Llama-3.1-70B-Instruct",
"Mistral-Large-Instruct-2407": "Mistral-Large-Instruct-2407",
"Qwen2.5-72B-Instruct": "Qwen2.5-72B-Instruct",
# closed source models
"claude-3-5-sonnet-20241022": "Claude-3.5-Sonnet-20241022",
"gemini-1.5-flash": "Gemini-1.5-Flash",
"gemini-1.5-pro": "Gemini-1.5-Pro-002",
"gpt-4o": "GPT-4o-20240806",
"gpt-4o-mini": "GPT-4o-mini-20240718",
"o1": "o1-20241217",
}
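
# The agentic log directories use provider-style model names; map from the
# base-log names above. Models missing here are skipped for agentic results.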
AGENTIC_LOG_MODEL_NAME_MAP = {
"claude-3-5-sonnet-20241022": "claude-3-5-sonnet-20241022",
"gemini-1.5-pro": "gemini-1.5-pro-002",
"gpt-4o": "gpt-4o-2024-08-06",
"o1": "o1-2024-12-17",
}
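
# Task subdirectories expected under each agentic model's log directory
# (hyphenated directory names vs. the underscored task names in METRIC_NAME).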
AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf", "agentharm", "swe-bench"]


def combine_eval_results(results_path: str, model_name: str, eval_type: str) -> dict:
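    """Merge one model's per-task Inspect eval logs into a single results dict.

    eval_type selects the log layout: "base" logs sit directly under
    results_path/<model_name>; "agentic" logs sit under
    results_path/<agentic log name>/<task>.
    """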
    results = {
        "config": {
            "model_name": model_name,
            # dummy keys
            "model_sha": MODEL_SHA_MAP[model_name],
            "model_dtype": "torch.float16",
        },
        "results": {},
    }
    if eval_type == "base":
for file in os.listdir(os.path.join(results_path, model_name)):
if file.endswith(".json"):
with open(os.path.join(results_path, model_name, file), "r") as f:
try:
result = json.load(f)
task_name = result["eval"]["task"].split("/")[-1]
if task_name == "math":
                            # TODO: change scorer if required
                            metrics = [
                                elm
                                for elm in result["results"]["scores"]
                                if elm["name"] == "expression_equivalance"
                            ][0]["metrics"]
else:
metrics = result["results"]["scores"][0]["metrics"]
metric_name = metrics[METRIC_NAME[task_name]]["name"]
metric_value = metrics[METRIC_NAME[task_name]]["value"]
results["results"].update(
{
task_name: {
metric_name: metric_value
}
}
)
except KeyError as e:
print(f"KeyError: {e}")
print(model_name)
print(file)
    elif eval_type == "agentic":
model_name = AGENTIC_LOG_MODEL_NAME_MAP[model_name] # change name based on log file structure
results_path = os.path.join(results_path, model_name)
for task in AGENTIC_TASKS:
for file in os.listdir(os.path.join(results_path, task)):
if file.endswith(".json"):
with open(os.path.join(results_path, task, file), "r") as f:
try:
result = json.load(f)
task_name = result["eval"]["task"].split("/")[-1]
metrics = result["results"]["scores"][0]["metrics"]
metric_name = metrics[METRIC_NAME[task_name]]["name"].split("/")[-1]
metric_value = metrics[METRIC_NAME[task_name]]["value"]
results["results"].update(
{
task_name: {
metric_name: metric_value
}
}
)
except KeyError as e:
print(f"KeyError: {e}")
print(model_name)
print(file)
return results


def main():
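    """Write one combined results file and one dummy request file per model."""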
    CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
base_bm_input_path = "./base_benchmarking_logs"
agentic_bm_input_path = "/fs01/projects/aieng/public/inspect_evals/agentic_benchmarking_runs"
os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)
for model_name in os.listdir(base_bm_input_path):
if os.path.isdir(os.path.join(base_bm_input_path, model_name)):
results = combine_eval_results(base_bm_input_path, model_name, "base")
# TMP: Add missing benchmarks to the results
            for task_name, metric_name in METRIC_NAME.items():
                if task_name not in results["results"]:
                    results["results"][task_name] = {metric_name: None}
            # Agentic logs exist only for models in AGENTIC_LOG_MODEL_NAME_MAP.
            agentic_model_dir = AGENTIC_LOG_MODEL_NAME_MAP.get(model_name, "NA")
            if os.path.isdir(os.path.join(agentic_bm_input_path, agentic_model_dir)):
agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name, "agentic")
results["results"].update(agentic_bm_results["results"])
with open(os.path.join(EVAL_RESULTS_PATH, f"{model_name}.json"), "w") as f:
json.dump(results, f, indent=4)
# Create dummy requests file
requests = {
"model": model_name,
"model_sha": MODEL_SHA_MAP[model_name],
"model_version": MODEL_VERSION_MAP[model_name],
"base_model": "",
"revision": "main",
"private": False,
"precision": "float16",
"weight_type": "Original",
"status": "FINISHED",
"submitted_time": "",
"model_type": "pretrained",
"likes": 0,
"params": 0,
"license": "custom",
}
with open(os.path.join(EVAL_REQUESTS_PATH, f"{model_name}.json"), "w") as f:
json.dump(requests, f, indent=4)


if __name__ == "__main__":
main()