import datasets
import numpy as np
from huggingface_hub import HfApi
from functools import lru_cache
from src.utils import opt_to_index, get_test_target


def get_leaderboard_models_reload():
    api = HfApi()

    # Load prechecked models
    try:
        with open("data/models.txt") as f:
            ungated_models = set(line.strip() for line in f)
    except FileNotFoundError:
        ungated_models = set()
    print(ungated_models)
    print(f"Number of prechecked models: {len(ungated_models)}")

    # List all datasets in the open-llm-leaderboard organization
    dataset_list = api.list_datasets(author="open-llm-leaderboard")

    models = []
    count_api_calls = 0

    for dataset in dataset_list:
        if dataset.id.endswith("-details"):
            # Format: "open-llm-leaderboard/<provider>__<model_name>-details"
            model_part = dataset.id.split("/")[-1].replace("-details", "")
            if "__" in model_part:
                provider, model = model_part.split("__", 1)
                model_name = f"{provider}/{model}"
            else:
                model_name = model_part

            # Only perform the check if the model is not already in the prechecked list.
            if model_name not in ungated_models:
                try:
                    count_api_calls += 1
                    # Check if the dataset can be loaded; if not, skip it.
                    datasets.get_dataset_config_names(dataset.id)
                except Exception:
                    continue  # Skip dataset if an exception occurs

            models.append(model_name)

    print(f"API calls: {count_api_calls}")
    print(f"Number of models: {len(models)}")

    # Save model list as txt file
    with open("data/models.txt", "w") as f:
        for model in models:
            f.write(model + "\n")

    return sorted(models)


def get_leaderboard_models():
    # Load prechecked (ungated) models
    with open("data/models.txt", "r") as f:
        ungated_models = [line.strip() for line in f]
    return sorted(ungated_models)


@lru_cache(maxsize=1)
def get_leaderboard_models_cached():
    return get_leaderboard_models()


def get_leaderboard_datasets(model_ids):
    if model_ids is None:
        return [
            'bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding',
            'bbh_disambiguation_qa', 'bbh_formal_fallacies', 'bbh_geometric_shapes',
            'bbh_hyperbaton', 'bbh_logical_deduction_five_objects',
            'bbh_logical_deduction_seven_objects', 'bbh_logical_deduction_three_objects',
            'bbh_movie_recommendation', 'bbh_navigate', 'bbh_object_counting',
            'bbh_penguins_in_a_table', 'bbh_reasoning_about_colored_objects', 'bbh_ruin_names',
            'bbh_salient_translation_error_detection', 'bbh_snarks', 'bbh_sports_understanding',
            'bbh_temporal_sequences', 'bbh_tracking_shuffled_objects_five_objects',
            'bbh_tracking_shuffled_objects_seven_objects',
            'bbh_tracking_shuffled_objects_three_objects', 'bbh_web_of_lies',
            'gpqa_diamond', 'gpqa_extended', 'gpqa_main', 'mmlu_pro',
            'musr_murder_mysteries', 'musr_object_placements', 'musr_team_allocation'
        ]
    # Map each model to its corresponding leaderboard details dataset
    leaderboard_model_ids = [
        f"open-llm-leaderboard/{model_id.replace('/', '__')}-details" for model_id in model_ids
    ]

    model_datasets = {}
    for model_id in leaderboard_model_ids:
        # Retrieve the list of available configuration names
        config_names = datasets.get_dataset_config_names(model_id)
        dataset_names = [name.split("__leaderboard_")[-1] for name in config_names]
        model_datasets[model_id] = set(dataset_names)

    # Compute the intersection of datasets across all models
    common_datasets = set.intersection(*model_datasets.values()) if model_datasets else set()

    # Filter out datasets that are not MCQ or currently do not work
    ignore = ["bbh_temporal_sequences", "math_", "ifeval"]
    discard = []
    for dataset in common_datasets:
        for ignore_data in ignore:
            if ignore_data in dataset:
                discard.append(dataset)
    common_datasets = [dataset for dataset in common_datasets if dataset not in discard]

    return sorted(common_datasets)
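
# Illustrative note (not from the original file): calling this helper with two placeholder
# model ids, e.g. get_leaderboard_datasets(["provider/model-a", "provider/model-b"]),
# returns the sorted list of leaderboard configs that both models share.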


def filter_labels(dataset_name, doc):
    # Convert each document's gold target into an integer class index;
    # the mapping depends on how the dataset encodes its answers.
    labels = []
    test_target, target_key = get_test_target(doc[0])

    if "answer_index" in doc[0].keys():
        labels = [d["answer_index"] for d in doc]
    elif test_target.startswith("(") or test_target.isalpha():
        labels = [opt_to_index(d[target_key]) for d in doc]
    elif dataset_name in ["bbh_boolean_expressions"]:
        for d in doc:
            if d[target_key] == "True":
                labels.append(1)
            elif d[target_key] == "False":
                labels.append(0)
    elif dataset_name in ["bbh_causal_judgement", "bbh_navigate", "bbh_web_of_lies"]:
        for d in doc:
            if d[target_key] == "Yes":
                labels.append(0)
            elif d[target_key] == "No":
                labels.append(1)
    elif dataset_name in ["bbh_formal_fallacies"]:
        for d in doc:
            if d[target_key] == "valid":
                labels.append(0)
            elif d[target_key] == "invalid":
                labels.append(1)
    elif dataset_name in ["bbh_sports_understanding"]:
        for d in doc:
            if d[target_key] == "yes":
                labels.append(0)
            elif d[target_key] == "no":
                labels.append(1)
    elif test_target.isdigit():
        labels = [int(d[target_key]) for d in doc]

    print(f"Number of labels: {len(labels)}")
    return labels


def filter_responses(data):
    # Each entry in "filtered_resps" holds one log probability per answer option;
    # collect them as a NumPy array per document.
    log_probs = []
    for resp in data["filtered_resps"]:
        log_prob = np.array([float(option[0]) for option in resp])
        log_probs.append(log_prob)
    return log_probs


def load_run_data(model_name, dataset_name):
    try:
        model_name = model_name.replace("/", "__")
        data = datasets.load_dataset(
            "open-llm-leaderboard/" + model_name + "-details",
            name=model_name + "__leaderboard_" + dataset_name,
            split="latest",
        )
        data = data.sort("doc_id")
        data = data.to_dict()

        # Get ground truth labels and per-option log probabilities
        log_probs = filter_responses(data)
        labels = filter_labels(dataset_name, data["doc"])
    except Exception as e:
        print(e)
        log_probs = []
        labels = []

    return log_probs, labels


@lru_cache(maxsize=8)
def load_run_data_cached(model_name, dataset_name):
    return load_run_data(model_name, dataset_name)
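

# Illustrative usage sketch (added for documentation; not part of the original module).
# The model id below is a placeholder -- substitute any entry from data/models.txt.
# "bbh_navigate" is one of the leaderboard configs handled above.
if __name__ == "__main__":
    available_models = get_leaderboard_models_cached()
    print(f"Models available: {len(available_models)}")

    example_model = "provider/model-name"  # placeholder, not a real repository id
    log_probs, labels = load_run_data_cached(example_model, "bbh_navigate")

    if log_probs and labels:
        # Predict the option with the highest stored log probability and compare to the labels.
        preds = [int(np.argmax(lp)) for lp in log_probs]
        accuracy = float(np.mean([p == y for p, y in zip(preds, labels)]))
        print(f"Accuracy on bbh_navigate: {accuracy:.3f}")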