import datasets
import numpy as np
from huggingface_hub import HfApi
from functools import lru_cache

def get_leaderboard_models():
    api = HfApi()

    # List all datasets in the open-llm-leaderboard organization
    leaderboard_datasets = api.list_datasets(author="open-llm-leaderboard")

    models = []
    for dataset in leaderboard_datasets:
        if dataset.id.endswith("-details"):
            dataset_id = dataset.id
            try:
                # Probe the config names to check that the dataset can be loaded
                datasets.get_dataset_config_names(dataset_id)
                # Format: "open-llm-leaderboard/<provider>__<model_name>-details"
                model_part = dataset.id.split("/")[-1].replace("-details", "")
                if "__" in model_part:
                    provider, model = model_part.split("__", 1)
                    models.append(f"{provider}/{model}")
                else:
                    models.append(model_part)
            except Exception:
                # Skip gated or otherwise unavailable datasets
                continue

    return sorted(models)

@lru_cache(maxsize=1)
def get_leaderboard_models_cached():
    return get_leaderboard_models()

def get_leaderboard_datasets(model_ids):
    if model_ids is None:
        # Default: the full set of leaderboard tasks
        return [
            'bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding', 'bbh_disambiguation_qa',
            'bbh_formal_fallacies', 'bbh_geometric_shapes', 'bbh_hyperbaton', 'bbh_logical_deduction_five_objects',
            'bbh_logical_deduction_seven_objects', 'bbh_logical_deduction_three_objects', 'bbh_movie_recommendation',
            'bbh_navigate', 'bbh_object_counting', 'bbh_penguins_in_a_table', 'bbh_reasoning_about_colored_objects',
            'bbh_ruin_names', 'bbh_salient_translation_error_detection', 'bbh_snarks', 'bbh_sports_understanding',
            'bbh_temporal_sequences', 'bbh_tracking_shuffled_objects_five_objects',
            'bbh_tracking_shuffled_objects_seven_objects', 'bbh_tracking_shuffled_objects_three_objects',
            'bbh_web_of_lies', 'gpqa_diamond', 'gpqa_extended', 'gpqa_main', 'ifeval',
            'math_algebra_hard', 'math_counting_and_prob_hard', 'math_geometry_hard', 'math_intermediate_algebra_hard',
            'math_num_theory_hard', 'math_prealgebra_hard', 'math_precalculus_hard', 'mmlu_pro',
            'musr_murder_mysteries', 'musr_object_placements', 'musr_team_allocation'
        ]

    # Map each model to its corresponding leaderboard details dataset
    leaderboard_model_ids = [f"open-llm-leaderboard/{model_id.replace('/', '__')}-details" for model_id in model_ids]

    model_datasets = {}
    for model_id in leaderboard_model_ids:
        # Retrieve the list of available configuration names (one per evaluated task)
        config_names = datasets.get_dataset_config_names(model_id)
        dataset_names = [name.split("__leaderboard_")[-1] for name in config_names]
        model_datasets[model_id] = set(dataset_names)

    # Compute the intersection of datasets across all models
    if model_datasets:
        common_datasets = set.intersection(*model_datasets.values())
        return sorted(common_datasets)
    return []

def filter_labels(doc):
    labels = []
    if "answer_index" in doc[0].keys():
        # Multiple-choice tasks store the index of the correct option directly
        for d in doc:
            labels.append(d["answer_index"])
    else:
        # Binary tasks store the answer as a "True"/"False" string
        for d in doc:
            if d["answer"] == "False":
                labels.append(0)
            elif d["answer"] == "True":
                labels.append(1)
            else:
                raise ValueError(f"Invalid label: {d['answer']}")
    return labels

def load_run_data(model_name, dataset_name):
    try:
        model_name = model_name.replace("/", "__")
        data = datasets.load_dataset(
            "open-llm-leaderboard/" + model_name + "-details",
            name=model_name + "__leaderboard_" + dataset_name,
            split="latest",
        )
        data = data.sort("doc_id")
        data = data.to_dict()

        # Get log probabilities for each response option
        log_probs = []
        for resp in data["filtered_resps"]:
            log_prob = np.array([float(option[0]) for option in resp])
            log_probs.append(log_prob)

        # Get ground truth labels
        labels = filter_labels(data["doc"])
    except Exception as e:
        print(e)
        log_probs = []
        labels = []

    return log_probs, labels
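
# Illustrative usage sketch (not part of the original Space code): the model id
# below is only an assumed example and may need to be replaced with a model that
# actually has result datasets on the Open LLM Leaderboard.
if __name__ == "__main__":
    example_models = ["meta-llama/Meta-Llama-3-8B-Instruct"]  # assumed example model id
    common = get_leaderboard_datasets(example_models)
    print(f"{len(common)} tasks available for the selected models")
    if common:
        log_probs, labels = load_run_data(example_models[0], common[0])
        print(f"Loaded {len(labels)} examples from task '{common[0]}'")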