import datasets
import numpy as np
from huggingface_hub import HfApi

from functools import lru_cache
from datasets.exceptions import DatasetNotFoundError


def get_leaderboard_models():
    """List model ids that publish "-details" result datasets on the Open LLM Leaderboard."""
    api = HfApi()
    
    # List all datasets in the open-llm-leaderboard organization
    dataset_list = api.list_datasets(author="open-llm-leaderboard")
    
    models = []
    for dataset in dataset_list:
        if dataset.id.endswith("-details"):
            dataset_id = dataset.id
            try:
                # Probing the config names raises if the dataset is gated or unavailable
                datasets.get_dataset_config_names(dataset_id)
                # Format: "open-llm-leaderboard/<provider>__<model_name>-details"
                model_part = dataset_id.split("/")[-1].replace("-details", "")
                if "__" in model_part:
                    provider, model = model_part.split("__", 1)
                    models.append(f"{provider}/{model}")
                else:
                    models.append(model_part)
            except DatasetNotFoundError:
                # Skip datasets that are gated or have been removed
                continue
            except Exception:
                # Skip any other dataset whose configs cannot be retrieved
                continue
    
    # Save model list as txt file
    with open("models.txt", "w") as f:
        for model in models:
            f.write(model + "\n")

    return sorted(models)


@lru_cache(maxsize=1)
def get_leaderboard_models_cached():
    return get_leaderboard_models()


def get_leaderboard_datasets(model_ids):
    """Return the evaluation datasets shared by all given models (or a default list)."""
    if model_ids is None:
        return [
            'bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding',
            'bbh_disambiguation_qa', 'bbh_formal_fallacies', 'bbh_geometric_shapes',
            'bbh_hyperbaton', 'bbh_logical_deduction_five_objects',
            'bbh_logical_deduction_seven_objects', 'bbh_logical_deduction_three_objects',
            'bbh_movie_recommendation', 'bbh_navigate', 'bbh_object_counting',
            'bbh_penguins_in_a_table', 'bbh_reasoning_about_colored_objects', 'bbh_ruin_names',
            'bbh_salient_translation_error_detection', 'bbh_snarks', 'bbh_sports_understanding',
            'bbh_temporal_sequences', 'bbh_tracking_shuffled_objects_five_objects',
            'bbh_tracking_shuffled_objects_seven_objects',
            'bbh_tracking_shuffled_objects_three_objects', 'bbh_web_of_lies',
            'gpqa_diamond', 'gpqa_extended', 'gpqa_main', 'ifeval',
            'math_algebra_hard', 'math_counting_and_prob_hard', 'math_geometry_hard',
            'math_intermediate_algebra_hard', 'math_num_theory_hard', 'math_prealgebra_hard',
            'math_precalculus_hard', 'mmlu_pro',
            'musr_murder_mysteries', 'musr_object_placements', 'musr_team_allocation',
        ]

    # Map each model to its corresponding leaderboard version
    leaderboard_model_ids = [f"open-llm-leaderboard/{model_id.replace('/', '__')}-details" for model_id in model_ids]

    model_datasets = {}

    for model_id in leaderboard_model_ids:
        # Retrieve the list of available configuration names
        config_names = datasets.get_dataset_config_names(model_id)
        dataset_names = [name.split("__leaderboard_")[-1] for name in config_names]
        model_datasets[model_id] = set(dataset_names)

    # Compute the intersection of datasets across all models; an empty input yields no datasets
    if not model_datasets:
        return []
    common_datasets = set.intersection(*model_datasets.values())

    return sorted(common_datasets)
    

def filter_labels(doc):
    """Extract ground-truth label indices from a list of leaderboard doc entries."""
    labels = []
    if "answer_index" in doc[0]:
        # Multiple-choice tasks store the index of the correct option directly
        for d in doc:
            labels.append(d["answer_index"])
    else:
        # Binary tasks store the answer as a "True"/"False" string
        for d in doc:
            if d["answer"] == "False":
                labels.append(0)
            elif d["answer"] == "True":
                labels.append(1)
            else:
                raise ValueError(f"Invalid label: {d['answer']}")
    return labels



def load_run_data(model_name, dataset_name):
    """Load per-example log probabilities and gold labels for a model/dataset pair."""
    try:
        model_name = model_name.replace("/", "__")

        data = datasets.load_dataset(
            "open-llm-leaderboard/" + model_name + "-details",
            name=model_name + "__leaderboard_" + dataset_name,
            split="latest",
        )
        data = data.sort("doc_id")
        data = data.to_dict()

        # Get log probabilities for each response
        log_probs = []
        for resp in data["filtered_resps"]:
            log_prob = np.array([float(option[0]) for option in resp])
            log_probs.append(log_prob)

        # Get ground truth labels
        labels = filter_labels(data["doc"])
        
    except Exception as e:
        print(f"Failed to load {model_name} / {dataset_name}: {e}")
        log_probs = []
        labels = []

    return log_probs, labels


@lru_cache(maxsize=8)
def load_run_data_cached(model_name, dataset_name):
    return load_run_data(model_name, dataset_name)
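

if __name__ == "__main__":
    # Minimal usage sketch showing how the helpers above compose, assuming network access
    # to the Hugging Face Hub. The model id below is an illustrative assumption, not a value
    # taken from this module; any leaderboard model id would work.
    example_model = "meta-llama/Meta-Llama-3-8B-Instruct"
    example_dataset = "mmlu_pro"

    available_models = get_leaderboard_models_cached()
    print(f"Found {len(available_models)} leaderboard models")

    shared_datasets = get_leaderboard_datasets([example_model])
    print(f"Datasets available for {example_model}: {shared_datasets[:5]} ...")

    log_probs, labels = load_run_data_cached(example_model, example_dataset)
    print(f"Loaded {len(labels)} examples from {example_dataset}")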