"""Aggregate per-model benchmark eval logs into combined eval-results and eval-queue JSON files."""

import json
import os

# Name of the headline metric to read from each task's eval log.
METRIC_NAME = {
    # Base benchmarks
    "arc_easy": "accuracy",
    "arc_challenge": "accuracy",
    "gpqa_diamond": "accuracy",
    "drop": "mean",
    "winogrande": "accuracy",
    "gsm8k": "accuracy",
    "hellaswag": "accuracy",
    "humaneval": "mean",
    "ifeval": "final_acc",
    "math": "accuracy",
    "mmlu": "accuracy",
    "mmlu_pro": "accuracy",
    "mmmu_multiple_choice": "accuracy",
    "mmmu_open": "accuracy",
    # Agentic benchmarks
    "gaia": "mean",
    "gdm_intercode_ctf": "accuracy",
}

# Reference URL recorded as "model_sha" for each evaluated model.
MODEL_SHA_MAP = {
    # Open-weights models
    "c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
    "Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
    "Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
    "Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
    # API models
    "claude-3-5-sonnet-20241022": "https://www.anthropic.com/claude/sonnet",
    "gemini-1.5-flash": "https://deepmind.google/technologies/gemini/flash",
    "gemini-1.5-pro": "https://deepmind.google/technologies/gemini/pro",
    "gpt-4o": "https://openai.com/index/hello-gpt-4o",
    "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
    "o1": "https://openai.com/o1",
}
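# Each per-task log is expected to look roughly like the sketch below
# (shape inferred from the parsing in combine_eval_results; values illustrative):
# {
#     "eval": {"task": ".../gsm8k"},
#     "results": {
#         "scores": [
#             {"name": "...", "metrics": {"accuracy": {"name": "accuracy", "value": 0.91}}}
#         ]
#     }
# }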
def combine_eval_results(results_path: str, model_name: str) -> dict:
    """Collect each task's headline metric for one model into a single results dict."""
    results = {
        "config": {
            "model_name": model_name,
            "model_sha": MODEL_SHA_MAP[model_name],
            "model_dtype": "torch.float16",
        },
        "results": {},
    }
    for file in os.listdir(os.path.join(results_path, model_name)):
        if file.endswith(".json"):
            with open(os.path.join(results_path, model_name, file), "r") as f:
                try:
                    result = json.load(f)
                    task_name = result["eval"]["task"].split("/")[-1]
                    if task_name == "math":
                        # The math log contains several scorers; keep the
                        # "expression_equivalance" entry (spelling as in the log).
                        metrics = [
                            elm
                            for elm in result["results"]["scores"]
                            if elm["name"] == "expression_equivalance"
                        ][0]["metrics"]
                    else:
                        metrics = result["results"]["scores"][0]["metrics"]
                    metric_name = metrics[METRIC_NAME[task_name]]["name"]
                    metric_value = metrics[METRIC_NAME[task_name]]["value"]
                    results["results"].update({task_name: {metric_name: metric_value}})
                except KeyError as e:
                    print(f"KeyError: {e}")
                    print(model_name)
                    print(file)
    return results
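# Sketch of the dict returned above and later written to eval-results/<model_name>.json
# (illustrative values only):
# {
#     "config": {"model_name": "...", "model_sha": "...", "model_dtype": "torch.float16"},
#     "results": {"gsm8k": {"accuracy": 0.91}, "gaia": {"mean": 0.25}}
# }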
def main():
    CACHE_PATH = os.getenv("HF_HOME", ".")
    EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
    EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")

    base_bm_input_path = "./base_benchmarking_logs"
    agentic_bm_input_path = "./agentic_benchmarking_logs"
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
    os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)

    for model_name in os.listdir(base_bm_input_path):
        if not os.path.isdir(os.path.join(base_bm_input_path, model_name)):
            continue
        results = combine_eval_results(base_bm_input_path, model_name)

        # Record a null metric for any base benchmark this model has no result for.
        for task_name, metric in METRIC_NAME.items():
            if task_name not in results["results"]:
                results["results"][task_name] = {metric: None}

        # Merge in agentic benchmark results when the model has them.
        if os.path.isdir(os.path.join(agentic_bm_input_path, model_name)):
            agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name)
            results["results"].update(agentic_bm_results["results"])

        with open(os.path.join(EVAL_RESULTS_PATH, f"{model_name}.json"), "w") as f:
            json.dump(results, f, indent=4)

        # Matching eval request entry for this model.
        requests = {
            "model": model_name,
            "model_sha": MODEL_SHA_MAP[model_name],
            "base_model": "",
            "revision": "main",
            "private": False,
            "precision": "float16",
            "weight_type": "Original",
            "status": "FINISHED",
            "submitted_time": "",
            "model_type": "pretrained",
            "likes": 0,
            "params": 0,
            "license": "custom",
        }
        with open(os.path.join(EVAL_REQUESTS_PATH, f"{model_name}.json"), "w") as f:
            json.dump(requests, f, indent=4)
if __name__ == "__main__":
    main()