"""Combine per-task Inspect eval logs into per-model results files, plus dummy
request files, in the layout expected by a leaderboard-style app."""

import json
import os

METRIC_NAME = {
    # single-turn
    "arc_easy": "accuracy",
    "arc_challenge": "accuracy",
    "gpqa_diamond": "accuracy",
    "drop": "mean",
    "winogrande": "accuracy",
    "gsm8k": "accuracy",
    "hellaswag": "accuracy",
    "humaneval": "mean",
    "ifeval": "final_acc",
    "math": "accuracy",
    "mmlu": "accuracy",
    "mmlu_pro": "accuracy",
    "mmmu_multiple_choice": "accuracy",
    "mmmu_open": "accuracy",
    # agentic
    "gaia": "accuracy",
    "gdm_intercode_ctf": "accuracy",
    "gdm_in_house_ctf": "accuracy",
}

MODEL_SHA_MAP = {
    # open source models
    "c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",  # TODO: verify for the 08-2024 version
    "Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
    "Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
    "Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
    # closed source models
    "claude-3-5-sonnet-20241022": "https://www.anthropic.com/claude/sonnet",
    "gemini-1.5-flash": "https://deepmind.google/technologies/gemini/flash",  # TODO: points to 2.0, can't find page for 1.5
    "gemini-1.5-pro": "https://deepmind.google/technologies/gemini/pro",
    "gpt-4o": "https://openai.com/index/hello-gpt-4o",
    "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
    "o1": "https://openai.com/o1",
}

AGENTIC_LOG_MODEL_NAME_MAP = {
    "claude-3-5-sonnet-20241022": "claude-3-5-sonnet-20241022",
    "gemini-1.5-pro": "gemini-1.5-pro-002",
    "gpt-4o": "gpt-4o-2024-08-06",
    "o1": "o1-2024-12-17",
}

AGENTIC_TASKS = ["gaia", "gdm-intercode-ctf", "gdm-in-house-ctf"]


def combine_eval_results(results_path: str, model_name: str, eval_type: str) -> dict:
    """Collect per-task scores for one model from its Inspect eval logs."""
    results = {
        "config": {
            "model_name": model_name,
            # dummy keys
            "model_sha": MODEL_SHA_MAP[model_name],
            "model_dtype": "torch.float16",
        },
        "results": {},
    }
    if eval_type == "base":
        for file in os.listdir(os.path.join(results_path, model_name)):
            if file.endswith(".json"):
                with open(os.path.join(results_path, model_name, file), "r") as f:
                    try:
                        result = json.load(f)
                        task_name = result["eval"]["task"].split("/")[-1]
                        if task_name == "math":
                            # math logs carry multiple scorers; keep only the
                            # expression-equivalence one ("expression_equivalance"
                            # is the scorer name as it appears in the logs)
                            metrics = [
                                elm
                                for elm in result["results"]["scores"]
                                if elm["name"] == "expression_equivalance"
                            ][0]["metrics"]  # TODO: change scorer if required
                        else:
                            metrics = result["results"]["scores"][0]["metrics"]
                        metric_name = metrics[METRIC_NAME[task_name]]["name"]
                        metric_value = metrics[METRIC_NAME[task_name]]["value"]
                        results["results"].update({task_name: {metric_name: metric_value}})
                    except KeyError as e:
                        print(f"KeyError: {e}")
                        print(model_name)
                        print(file)
    elif eval_type == "agentic":
        # change name based on log file structure
        model_name = AGENTIC_LOG_MODEL_NAME_MAP[model_name]
        results_path = os.path.join(results_path, model_name)
        for task in AGENTIC_TASKS:
            for file in os.listdir(os.path.join(results_path, task)):
                if file.endswith(".json"):
                    with open(os.path.join(results_path, task, file), "r") as f:
                        try:
                            result = json.load(f)
                            task_name = result["eval"]["task"].split("/")[-1]
                            metrics = result["results"]["scores"][0]["metrics"]
                            metric_name = metrics[METRIC_NAME[task_name]]["name"]
                            metric_value = metrics[METRIC_NAME[task_name]]["value"]
                            results["results"].update({task_name: {metric_name: metric_value}})
                        except KeyError as e:
                            print(f"KeyError: {e}")
                            print(model_name)
                            print(file)
    return results


def main():
    CACHE_PATH = os.getenv("HF_HOME", ".")
    EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
    EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
    base_bm_input_path = "./base_benchmarking_logs"
    agentic_bm_input_path = "/fs01/projects/aieng/public/inspect_evals/agentic_benchmarking_runs"

    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
    os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)

    for model_name in os.listdir(base_bm_input_path):
        if os.path.isdir(os.path.join(base_bm_input_path, model_name)):
            results = combine_eval_results(base_bm_input_path, model_name, "base")
            # TMP: add missing benchmarks to the results as null entries
            for task_name, metric_name in METRIC_NAME.items():
                if task_name not in results["results"]:
                    results["results"].update({task_name: {metric_name: None}})
            # merge in agentic results when this model has agentic logs
            if os.path.isdir(
                os.path.join(
                    agentic_bm_input_path,
                    AGENTIC_LOG_MODEL_NAME_MAP.get(model_name, "NA"),
                )
            ):
                agentic_bm_results = combine_eval_results(
                    agentic_bm_input_path, model_name, "agentic"
                )
                results["results"].update(agentic_bm_results["results"])
            with open(os.path.join(EVAL_RESULTS_PATH, f"{model_name}.json"), "w") as f:
                json.dump(results, f, indent=4)

            # Create dummy requests file
            requests = {
                "model": model_name,
                "model_sha": MODEL_SHA_MAP[model_name],
                "base_model": "",
                "revision": "main",
                "private": False,
                "precision": "float16",
                "weight_type": "Original",
                "status": "FINISHED",
                "submitted_time": "",
                "model_type": "pretrained",
                "likes": 0,
                "params": 0,
                "license": "custom",
            }
            with open(os.path.join(EVAL_REQUESTS_PATH, f"{model_name}.json"), "w") as f:
                json.dump(requests, f, indent=4)


if __name__ == "__main__":
    main()