import json
import os
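
# This script combines the per-task eval result JSONs found under
# ./base_benchmarking_logs and ./agentic_benchmarking_logs into the
# results/requests JSON files the leaderboard expects, written to
# $HF_HOME/eval-results and $HF_HOME/eval-queue.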


METRIC_NAME = {
    # single-turn
    "arc_easy": "accuracy",
    "arc_challenge": "accuracy",
    "gpqa_diamond": "accuracy",
    "drop": "mean",
    "winogrande": "accuracy",
    "gsm8k": "accuracy",
    "hellaswag": "accuracy",
    "humaneval": "mean",
    "ifeval": "final_acc",
    "math": "accuracy",
    "mmlu": "accuracy",
    "mmlu_pro": "accuracy",
    "mmmu_multiple_choice": "accuracy",
    "mmmu_open": "accuracy",

    # agentic
    "gaia": "mean",
    "gdm_intercode_ctf": "accuracy",
}
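
# Expected shape of each per-task result JSON, inferred from the parsing in
# combine_eval_results below (scorer and metric names are illustrative; real
# log files may contain additional fields):
#
# {
#   "eval":    {"task": "inspect_evals/arc_easy", ...},
#   "results": {"scores": [{"name": "choice",
#                           "metrics": {"accuracy": {"name": "accuracy",
#                                                    "value": 0.85}}}]}
# }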

MODEL_SHA_MAP = {
    # open source models
    "c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus", # TODO: verify for the 08-2024 version
    "Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
    "Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
    "Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",

    # closed source models
    "claude-3-5-sonnet-20241022": "https://www.anthropic.com/claude/sonnet",
    "gemini-1.5-flash": "https://deepmind.google/technologies/gemini/flash", # TODO: points to 2.0, can't find page for 1.5
    "gemini-1.5-pro": "https://deepmind.google/technologies/gemini/pro",
    "gpt-4o": "https://openai.com/index/hello-gpt-4o",
    "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
    "o1": "https://openai.com/o1",
}


def combine_eval_results(results_path: str, model_name: str) -> dict:
    """Merge the per-task result JSONs under results_path/model_name into one dict."""
    results = {
        "config": {
            "model_name": model_name,
            # dummy keys
            "model_sha": MODEL_SHA_MAP[model_name],
            "model_dtype": "torch.float16",
        },
        "results": {},
    }
    for file in os.listdir(os.path.join(results_path, model_name)):
        if file.endswith(".json"):
            with open(os.path.join(results_path, model_name, file), "r") as f:
                try:
                    result = json.load(f)
                    task_name = result["eval"]["task"].split("/")[-1]
                    if task_name == "math":
                        # math logs may carry several scorers; keep the expression-equivalence one
                        metrics = [
                            elm for elm in result["results"]["scores"]
                            if elm["name"] == "expression_equivalance"
                        ][0]["metrics"]  # TODO: change scorer if required
                    else:
                        metrics = result["results"]["scores"][0]["metrics"]
                    metric_name = metrics[METRIC_NAME[task_name]]["name"]
                    metric_value = metrics[METRIC_NAME[task_name]]["value"]
                    results["results"].update(
                        {
                            task_name: {
                                metric_name: metric_value
                            }
                        }
                    )
                except KeyError as e:
                    print(f"KeyError: {e}")
                    print(model_name)
                    print(file)
    return results
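
# Example of the dict returned above (metric values illustrative only):
#
# {
#   "config":  {"model_name": "gpt-4o",
#               "model_sha": "https://openai.com/index/hello-gpt-4o",
#               "model_dtype": "torch.float16"},
#   "results": {"arc_easy": {"accuracy": 0.92},
#               "gsm8k":    {"accuracy": 0.88}}
# }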


def main():
    CACHE_PATH = os.getenv("HF_HOME", ".")
    EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
    EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")

    base_bm_input_path = "./base_benchmarking_logs"
    agentic_bm_input_path = "./agentic_benchmarking_logs"
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
    os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)
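
    # Expected input layout (one sub-directory per model, keyed by MODEL_SHA_MAP):
    #
    #   base_benchmarking_logs/<model_name>/<task>.json
    #   agentic_benchmarking_logs/<model_name>/<task>.json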

    for model_name in os.listdir(base_bm_input_path):
        if not os.path.isdir(os.path.join(base_bm_input_path, model_name)):
            continue
        results = combine_eval_results(base_bm_input_path, model_name)
        # TMP: Add dummy agentic benchmarks to the results
        for task_name, metric_name in METRIC_NAME.items():
            if task_name not in results["results"]:
                results["results"].update({task_name: {metric_name: None}})
        if os.path.isdir(os.path.join(agentic_bm_input_path, model_name)):
            agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name)
            results["results"].update(agentic_bm_results["results"])
        with open(os.path.join(EVAL_RESULTS_PATH, f"{model_name}.json"), "w") as f:
            json.dump(results, f, indent=4)

        # Create dummy requests file
        requests = {
            "model": model_name,
            "model_sha": MODEL_SHA_MAP[model_name],
            "base_model": "",
            "revision": "main",
            "private": False,
            "precision": "float16",
            "weight_type": "Original",
            "status": "FINISHED",
            "submitted_time": "",
            "model_type": "pretrained",
            "likes": 0,
            "params": 0,
            "license": "custom",
        }
        with open(os.path.join(EVAL_REQUESTS_PATH, f"{model_name}.json"), "w") as f:
            json.dump(requests, f, indent=4)


if __name__ == "__main__":
    main()