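"""Combine Inspect eval logs into leaderboard-style result files.

For each model, collect the headline metric for every base (single-turn) and
agentic benchmark from its JSON logs, then write a combined results file and
a dummy requests file under the eval-results / eval-queue directories.
"""
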
import json
import os

# Headline metric to read from each task's Inspect log.
METRIC_NAME = {
# single-turn
"arc_easy": "accuracy",
"arc_challenge": "accuracy",
"gpqa_diamond": "accuracy",
"drop": "mean",
"winogrande": "accuracy",
"gsm8k": "accuracy",
"hellaswag": "accuracy",
"humaneval": "mean",
"ifeval": "final_acc",
"math": "accuracy",
"mmlu": "accuracy",
"mmlu_pro": "accuracy",
"mmmu_multiple_choice": "accuracy",
"mmmu_open": "accuracy",
# agentic
"gaia": "accuracy",
"gdm_intercode_ctf": "accuracy",
"gdm_in_house_ctf": "accuracy",
}

# Reference URL stored as the dummy "model_sha" for each model.
MODEL_SHA_MAP = {
# open source models
"c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus", # TODO: verify for the 08-2024 version
"Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
"Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
"Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
# closed source models
"claude-3-5-sonnet-20241022": "https://www.anthropic.com/claude/sonnet",
"gemini-1.5-flash": "https://deepmind.google/technologies/gemini/flash", # TODO: points to 2.0, can't find page for 1.5
"gemini-1.5-pro": "https://deepmind.google/technologies/gemini/pro",
"gpt-4o": "https://openai.com/index/hello-gpt-4o",
"gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
"o1": "https://openai.com/o1",
}

# Map leaderboard model names to the dated names used in the agentic log directories.
AGENTIC_LOG_MODEL_NAME_MAP = {
"claude-3-5-sonnet-20241022": "claude-3-5-sonnet-20241022",
"gemini-1.5-pro": "gemini-1.5-pro-002",
"gpt-4o": "gpt-4o-2024-08-06",
"o1": "o1-2024-12-17",
}
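
# Agentic task directory names (hyphenated on disk; the task names recorded
# inside the logs use underscores and match the METRIC_NAME keys above).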
AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf"]


def combine_eval_results(results_path: str, model_name: str, eval_type: str) -> dict:
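    """Collect per-task metrics for one model from its Inspect JSON logs.

    eval_type is "base" (one flat directory of logs per model) or "agentic"
    (one subdirectory per task under the model's log directory).
    """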
    results = {
        "config": {
            "model_name": model_name,
            # dummy keys
            "model_sha": MODEL_SHA_MAP[model_name],
            "model_dtype": "torch.float16",
        },
        "results": {},
    }
    if eval_type == "base":
for file in os.listdir(os.path.join(results_path, model_name)):
if file.endswith(".json"):
with open(os.path.join(results_path, model_name, file), "r") as f:
try:
result = json.load(f)
                        # task names are recorded as "<namespace>/<task>"
                        task_name = result["eval"]["task"].split("/")[-1]
                        if task_name == "math":
                            # math logs several scorers; keep the expression-equivalence
                            # one (the scorer name is spelled this way in the logs)
                            metrics = next(
                                score for score in result["results"]["scores"]
                                if score["name"] == "expression_equivalance"
                            )["metrics"]  # TODO: change scorer if required
                        else:
                            metrics = result["results"]["scores"][0]["metrics"]
                        metric_name = metrics[METRIC_NAME[task_name]]["name"]
                        metric_value = metrics[METRIC_NAME[task_name]]["value"]
results["results"].update(
{
task_name: {
metric_name: metric_value
}
}
)
                    except KeyError as e:
                        print(f"KeyError: {e} in {model_name}/{file}")
    elif eval_type == "agentic":
        model_name = AGENTIC_LOG_MODEL_NAME_MAP[model_name]  # agentic log dirs use the dated model names
results_path = os.path.join(results_path, model_name)
for task in AGENTIC_TASKS:
for file in os.listdir(os.path.join(results_path, task)):
if file.endswith(".json"):
with open(os.path.join(results_path, task, file), "r") as f:
try:
result = json.load(f)
task_name = result["eval"]["task"].split("/")[-1]
metrics = result["results"]["scores"][0]["metrics"]
metric_name = metrics[METRIC_NAME[task_name]]["name"]
metric_value = metrics[METRIC_NAME[task_name]]["value"]
results["results"].update(
{
task_name: {
metric_name: metric_value
}
}
)
except KeyError as e:
print(f"KeyError: {e}")
print(model_name)
print(file)
return results


def main():
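    """Write combined results and dummy request files for every model found."""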
    CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
base_bm_input_path = "./base_benchmarking_logs"
agentic_bm_input_path = "/fs01/projects/aieng/public/inspect_evals/agentic_benchmarking_runs"
os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)
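    # each subdirectory of the base logs dir is treated as one model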
for model_name in os.listdir(base_bm_input_path):
if os.path.isdir(os.path.join(base_bm_input_path, model_name)):
results = combine_eval_results(base_bm_input_path, model_name, "base")
            # TMP: add benchmarks that have no logs yet with null metrics
            for task_name, metric_name in METRIC_NAME.items():
                if task_name not in results["results"]:
                    results["results"][task_name] = {metric_name: None}
            # merge agentic results only for models that also have agentic logs
            if os.path.isdir(os.path.join(agentic_bm_input_path, AGENTIC_LOG_MODEL_NAME_MAP.get(model_name, "NA"))):
agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name, "agentic")
results["results"].update(agentic_bm_results["results"])
with open(os.path.join(EVAL_RESULTS_PATH, f"{model_name}.json"), "w") as f:
json.dump(results, f, indent=4)
# Create dummy requests file
requests = {
"model": model_name,
"model_sha": MODEL_SHA_MAP[model_name],
"base_model": "",
"revision": "main",
"private": False,
"precision": "float16",
"weight_type": "Original",
"status": "FINISHED",
"submitted_time": "",
"model_type": "pretrained",
"likes": 0,
"params": 0,
"license": "custom",
}
with open(os.path.join(EVAL_REQUESTS_PATH, f"{model_name}.json"), "w") as f:
json.dump(requests, f, indent=4)


if __name__ == "__main__":
main()