import json
import os
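# Combine per-task evaluation logs (one JSON file per task, grouped per model under
# ./base_benchmarking_logs and ./agentic_benchmarking_logs) into one leaderboard-style
# results file per model, plus a dummy "request" file, written under
# $HF_HOME/eval-results and $HF_HOME/eval-queue.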


# Maps each task name to the key of the metric to report from that task's
# scorer "metrics" dict (used in combine_eval_results below).
METRIC_NAME = {
    # base
    "arc_easy": "accuracy",
    "arc_challenge": "accuracy",
    "gpqa_diamond": "accuracy",
    "drop": "mean",
    "winogrande": "accuracy",
    "gsm8k": "accuracy",
    "hellaswag": "accuracy",
    "humaneval": "mean",
    "ifeval": "final_acc",
    "math": "accuracy",
    "mmlu": "accuracy",
    "mmlu_pro": "accuracy",

    # agentic
    "gaia": "mean",
    "gdm_intercode_ctf": "accuracy",
}
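
# Illustrative sketch of the minimal shape combine_eval_results() expects from each
# per-task log JSON; the field names are inferred from the accesses in that function,
# and the concrete values/scorer names here are made up:
#
#   {
#       "eval": {"task": "<suite>/gsm8k"},
#       "results": {
#           "scores": [
#               {"name": "<scorer>", "metrics": {"accuracy": {"name": "accuracy", "value": 0.87}}}
#           ]
#       }
#   }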


def combine_eval_results(results_path: str, model_name: str) -> dict:
    results = {
        "config": {
            "model_name": model_name,
            # dummy keys
            "model_sha": model_name,
            "model_dtype": "torch.float16",
        },
        "results": {},
    }
    for file in os.listdir(os.path.join(results_path, model_name)):
        if file.endswith(".json"):
            with open(os.path.join(results_path, model_name, file), "r") as f:
                result = json.load(f)
                task_name = result["eval"]["task"].split("/")[-1]
                if task_name == "math":
                    # TODO: change scorer if required
                    metrics = next(
                        score for score in result["results"]["scores"]
                        if score["name"] == "expression_equivalance"
                    )["metrics"]
                else:
                    metrics = result["results"]["scores"][0]["metrics"]
                metric = metrics[METRIC_NAME[task_name]]
                results["results"][task_name] = {metric["name"]: metric["value"]}
    return results


def main():
    CACHE_PATH = os.getenv("HF_HOME", ".")
    EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
    EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
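    # Assumption: these directories mirror the eval-results / eval-queue layout that a
    # Hugging Face leaderboard space reads, which is why a dummy request file with
    # "FINISHED" status is written for each model below.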

    base_bm_input_path = "./base_benchmarking_logs"
    agentic_bm_input_path = "./agentic_benchmarking_logs"
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
    os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)

    for model_name in os.listdir(base_bm_input_path):
        if not os.path.isdir(os.path.join(base_bm_input_path, model_name)):
            # Skip stray files: without this guard, `results` from a previous
            # iteration (or no `results` at all) would be written for this entry.
            continue
        results = combine_eval_results(base_bm_input_path, model_name)
        if os.path.isdir(os.path.join(agentic_bm_input_path, model_name)):
            agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name)
            results["results"].update(agentic_bm_results["results"])
        with open(os.path.join(EVAL_RESULTS_PATH, f"{model_name}.json"), "w") as f:
            json.dump(results, f, indent=4)

        # Create dummy requests file
        requests = {
            "model": model_name,
            "base_model": "",
            "revision": "main",
            "private": False,
            "precision": "float16",
            "weight_type": "Original",
            "status": "FINISHED",
            "submitted_time": "",
            "model_type": "pretrained",
            "likes": 0,
            "params": 0,
            "license": "custom",
        }
        with open(os.path.join(EVAL_REQUESTS_PATH, f"{model_name}.json"), "w") as f:
            json.dump(requests, f, indent=4)


if __name__ == "__main__":
    main()