import datasets
import numpy as np
from huggingface_hub import HfApi
from functools import lru_cache

def get_leaderboard_models():
    api = HfApi()
    # List all datasets in the open-llm-leaderboard organization
    leaderboard_datasets = api.list_datasets(author="open-llm-leaderboard")
    models = []
    for dataset in leaderboard_datasets:
        if dataset.id.endswith("-details"):
            # Format: "open-llm-leaderboard/<provider>__<model_name>-details"
            model_part = dataset.id.split("/")[-1].removesuffix("-details")
            if "__" in model_part:
                provider, model = model_part.split("__", 1)
                models.append(f"{provider}/{model}")
            else:
                models.append(model_part)
    return sorted(models)
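
# Example usage (a sketch; this queries the Hugging Face Hub, so the exact
# listing depends on the current state of the leaderboard):
#   >>> models = get_leaderboard_models()
#   >>> models[:2]
#   ['<provider>/<model>', '<provider>/<model>']  # shape of the output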

@lru_cache(maxsize=1)
def get_leaderboard_models_cached():
    # Cache the Hub listing so repeated calls don't re-query the API
    return get_leaderboard_models()

def get_leaderboard_datasets(model_ids):
    if model_ids is None:
        # Default to the full set of leaderboard tasks
        return [
            'bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding',
            'bbh_disambiguation_qa', 'bbh_formal_fallacies', 'bbh_geometric_shapes',
            'bbh_hyperbaton', 'bbh_logical_deduction_five_objects', 'bbh_logical_deduction_seven_objects',
            'bbh_logical_deduction_three_objects', 'bbh_movie_recommendation', 'bbh_navigate',
            'bbh_object_counting', 'bbh_penguins_in_a_table', 'bbh_reasoning_about_colored_objects',
            'bbh_ruin_names', 'bbh_salient_translation_error_detection', 'bbh_snarks',
            'bbh_sports_understanding', 'bbh_temporal_sequences', 'bbh_tracking_shuffled_objects_five_objects',
            'bbh_tracking_shuffled_objects_seven_objects', 'bbh_tracking_shuffled_objects_three_objects',
            'bbh_web_of_lies', 'gpqa_diamond', 'gpqa_extended', 'gpqa_main', 'ifeval',
            'math_algebra_hard', 'math_counting_and_prob_hard', 'math_geometry_hard',
            'math_intermediate_algebra_hard', 'math_num_theory_hard', 'math_prealgebra_hard',
            'math_precalculus_hard', 'mmlu_pro', 'musr_murder_mysteries', 'musr_object_placements',
            'musr_team_allocation',
        ]
    # Map each model to its corresponding leaderboard details dataset
    leaderboard_model_ids = [f"open-llm-leaderboard/{model_id.replace('/', '__')}-details"
                             for model_id in model_ids]
    model_datasets = {}
    for model_id in leaderboard_model_ids:
        # Retrieve the list of available configuration names
        config_names = datasets.get_dataset_config_names(model_id)
        dataset_names = [name.split("__leaderboard_")[-1] for name in config_names]
        model_datasets[model_id] = set(dataset_names)
    # Keep only the datasets available for every requested model; an empty
    # model list yields an empty result instead of falling through to None
    common_datasets = set.intersection(*model_datasets.values()) if model_datasets else set()
    return sorted(common_datasets)
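
# The intersection step above, on toy data (no network access needed):
#   >>> per_model = {"a": {"bbh_navigate", "ifeval"}, "b": {"ifeval", "mmlu_pro"}}
#   >>> sorted(set.intersection(*per_model.values()))
#   ['ifeval']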

def filter_labels(doc):
    labels = []
    # Multiple-choice tasks store the index of the correct option; boolean
    # tasks store the answer as a "True"/"False" string.
    if "answer_index" in doc[0]:
        for d in doc:
            labels.append(d["answer_index"])
    else:
        for d in doc:
            if d["answer"] == "False":
                labels.append(0)
            elif d["answer"] == "True":
                labels.append(1)
            else:
                raise ValueError(f"Invalid label: {d['answer']}")
    return labels
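
# Doctest-style sketch of the two document formats filter_labels handles:
#   >>> filter_labels([{"answer_index": 2}, {"answer_index": 0}])
#   [2, 0]
#   >>> filter_labels([{"answer": "True"}, {"answer": "False"}])
#   [1, 0]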

def load_run_data(model_name, dataset_name):
    try:
        model_name = model_name.replace("/", "__")
        data = datasets.load_dataset(
            "open-llm-leaderboard/" + model_name + "-details",
            name=model_name + "__leaderboard_" + dataset_name,
            split="latest",
        )
        data = data.sort("doc_id")
        data = data.to_dict()
        # Get the log probability of each answer option; every filtered
        # response is a list whose entries hold the option's log prob first
        log_probs = []
        for resp in data["filtered_resps"]:
            log_prob = np.array([float(option[0]) for option in resp])
            log_probs.append(log_prob)
        # Get ground truth labels
        labels = filter_labels(data["doc"])
    except Exception as e:
        print(f"Could not load run data for {model_name} / {dataset_name}: {e}")
        log_probs = []
        labels = []
    return log_probs, labels
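
if __name__ == "__main__":
    # Minimal end-to-end sketch: score one run by accuracy. The model and
    # dataset names below are illustrative assumptions, not guaranteed to
    # exist on the leaderboard; swap in any pair returned by the helpers above.
    example_model = "meta-llama/Meta-Llama-3-8B-Instruct"  # hypothetical
    example_dataset = "bbh_navigate"
    log_probs, labels = load_run_data(example_model, example_dataset)
    if log_probs:
        # Predict the option with the highest log probability per question
        preds = [int(np.argmax(lp)) for lp in log_probs]
        accuracy = float(np.mean([p == y for p, y in zip(preds, labels)]))
        print(f"{example_model} on {example_dataset}: accuracy {accuracy:.3f}")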