from functools import lru_cache

import datasets
import numpy as np
from huggingface_hub import HfApi


def get_leaderboard_models():
    """Return the model IDs ("provider/model") that have result datasets on the Open LLM Leaderboard."""
    api = HfApi()

    # List all datasets in the open-llm-leaderboard organization
    # (named `leaderboard_datasets` to avoid shadowing the imported `datasets` module)
    leaderboard_datasets = api.list_datasets(author="open-llm-leaderboard")

    models = []
    for dataset in leaderboard_datasets:
        if dataset.id.endswith("-details"):
            # Format: "open-llm-leaderboard/<provider>__<model_name>-details"
            model_part = dataset.id.split("/")[-1].replace("-details", "")
            if "__" in model_part:
                provider, model = model_part.split("__", 1)
                models.append(f"{provider}/{model}")
            else:
                models.append(model_part)

    return sorted(models)



@lru_cache(maxsize=1)
def get_leaderboard_models_cached():
    return get_leaderboard_models()


def get_leaderboard_datasets(model_ids):
    """Return the evaluation datasets that are available for every model in `model_ids`."""
    if model_ids is None:
        # Default to the full set of leaderboard tasks
        return [
            'bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding',
            'bbh_disambiguation_qa', 'bbh_formal_fallacies', 'bbh_geometric_shapes',
            'bbh_hyperbaton', 'bbh_logical_deduction_five_objects', 'bbh_logical_deduction_seven_objects',
            'bbh_logical_deduction_three_objects', 'bbh_movie_recommendation', 'bbh_navigate',
            'bbh_object_counting', 'bbh_penguins_in_a_table', 'bbh_reasoning_about_colored_objects',
            'bbh_ruin_names', 'bbh_salient_translation_error_detection', 'bbh_snarks',
            'bbh_sports_understanding', 'bbh_temporal_sequences', 'bbh_tracking_shuffled_objects_five_objects',
            'bbh_tracking_shuffled_objects_seven_objects', 'bbh_tracking_shuffled_objects_three_objects',
            'bbh_web_of_lies', 'gpqa_diamond', 'gpqa_extended', 'gpqa_main', 'ifeval',
            'math_algebra_hard', 'math_counting_and_prob_hard', 'math_geometry_hard',
            'math_intermediate_algebra_hard', 'math_num_theory_hard', 'math_prealgebra_hard',
            'math_precalculus_hard', 'mmlu_pro', 'musr_murder_mysteries', 'musr_object_placements',
            'musr_team_allocation',
        ]

    # Map each model to its corresponding leaderboard details dataset
    leaderboard_model_ids = [
        f"open-llm-leaderboard/{model_id.replace('/', '__')}-details" for model_id in model_ids
    ]

    model_datasets = {}
    for model_id in leaderboard_model_ids:
        # Retrieve the list of available configuration names
        config_names = datasets.get_dataset_config_names(model_id)
        dataset_names = [name.split("__leaderboard_")[-1] for name in config_names]
        model_datasets[model_id] = set(dataset_names)

    # Compute the intersection of datasets across all models; guard against an
    # empty model list, which would otherwise leave `common_datasets` undefined.
    common_datasets = set.intersection(*model_datasets.values()) if model_datasets else set()

    return sorted(common_datasets)

def filter_labels(doc):
    """Extract ground-truth labels from leaderboard documents.

    Multiple-choice tasks expose an `answer_index`; binary tasks expose a
    True/False string in `answer`.
    """
    labels = []
    if "answer_index" in doc[0]:
        for d in doc:
            labels.append(d["answer_index"])
    else:
        for d in doc:
            if d["answer"] == "False":
                labels.append(0)
            elif d["answer"] == "True":
                labels.append(1)
            else:
                raise ValueError(f"Invalid label: {d['answer']!r}")
    return labels

def load_run_data(model_name, dataset_name):
    """Load per-sample log probabilities and ground-truth labels for a model/dataset pair."""
    try:
        model_name = model_name.replace("/", "__")

        data = datasets.load_dataset(
            f"open-llm-leaderboard/{model_name}-details",
            name=f"{model_name}__leaderboard_{dataset_name}",
            split="latest",
        )
        data = data.sort("doc_id")
        data = data.to_dict()

        # Get log probabilities for each response option
        log_probs = []
        for resp in data["filtered_resps"]:
            log_prob = np.array([float(option[0]) for option in resp])
            log_probs.append(log_prob)

        # Get ground-truth labels
        labels = filter_labels(data["doc"])

    except Exception as e:
        print(f"Failed to load {model_name} / {dataset_name}: {e}")
        log_probs = []
        labels = []

    return log_probs, labels
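

if __name__ == "__main__":
    # Minimal usage sketch (an illustrative assumption, not part of the original module):
    # list leaderboard models, find the evaluation datasets they share, and load one run.
    # Requires network access to the Hugging Face Hub; the slice size is arbitrary.
    example_models = get_leaderboard_models_cached()[:2]
    common = get_leaderboard_datasets(example_models)
    print(f"{len(example_models)} models share {len(common)} datasets")

    if example_models and common:
        log_probs, labels = load_run_data(example_models[0], common[0])
        print(f"Loaded {len(labels)} samples from {common[0]}")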