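"""Helpers for loading Open LLM Leaderboard evaluation details.

Provides a (currently hardcoded) list of models and benchmark datasets, and
loads per-example log probabilities and ground-truth labels from the
corresponding "open-llm-leaderboard/<provider>__<model>-details" datasets
on the Hugging Face Hub.
"""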
import datasets
import numpy as np
from huggingface_hub import HfApi
from functools import lru_cache

def get_leaderboard_models():
    # api = HfApi()
    # List all datasets in the open-llm-leaderboard organization
    # datasets = api.list_datasets(author="open-llm-leaderboard")
    models = []
    # for dataset in datasets:
    #     if dataset.id.endswith("-details"):
    #         # Format: "open-llm-leaderboard/<provider>__<model_name>-details"
    #         model_part = dataset.id.split("/")[-1].replace("-details", "")
    #         provider, model = model_part.split("__", 1)
    #         models.append(f"{provider}/{model}")
    # Hardcoded example models. The provider prefix must match the Hub
    # organization name (e.g. "meta-llama") so that load_run_data() can
    # reconstruct the "-details" dataset id.
    models = [
        "meta-llama/Llama-3.2-1B-Instruct",
        "meta-llama/Llama-3.2-3B-Instruct",
        "meta-llama/Llama-3.1-8B-Instruct",
        "meta-llama/Llama-3.1-70B-Instruct",
        "meta-llama/Llama-3.3-70B-Instruct",
    ]
    return sorted(models)

@lru_cache(maxsize=1)
def get_leaderboard_models_cached():
    # Cache the model list so repeated calls (e.g. from UI callbacks) are cheap.
    return get_leaderboard_models()

def get_leaderboard_datasets():
    return [
        "ai2_arc",
        "hellaswag",
        "mmlu_pro",
        "truthful_qa",
        "winogrande",
        "gsm8k",
    ]

def filter_labels(doc):
    """Extract ground-truth label indices from a list of task documents."""
    labels = []
    if "answer_index" in doc[0].keys():
        # Multiple-choice tasks store the index of the correct option.
        for d in doc:
            labels.append(int(d["answer_index"]))
    else:
        # Binary tasks store the answer as a "True"/"False" string.
        for d in doc:
            if d["answer"] == "False":
                labels.append(0)
            elif d["answer"] == "True":
                labels.append(1)
            else:
                raise ValueError("Invalid label")
    return labels

def load_run_data(model_name, dataset_name):
    """Load per-example log probabilities and labels for one leaderboard run."""
    try:
        model_name = model_name.replace("/", "__")
        data = datasets.load_dataset(
            "open-llm-leaderboard/" + model_name + "-details",
            name=model_name + "__leaderboard_" + dataset_name,
            split="latest",
        )
        data = data.sort("doc_id")
        data = data.to_dict()

        # Get log probabilities for each response option
        log_probs = []
        for resp in data["filtered_resps"]:
            log_prob = np.array([float(option[0]) for option in resp])
            log_probs.append(log_prob)

        # Get ground-truth labels
        labels = filter_labels(data["doc"])
    except Exception as e:
        print(e)
        log_probs = []
        labels = []
    return log_probs, labels
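
# --- Usage sketch (illustrative, not part of the original Space code) ---
# A minimal example of how these helpers could be wired together: pick a
# model and a benchmark, load its leaderboard run, and score accuracy by
# taking the argmax over the per-option log probabilities. The accuracy
# computation is an assumption about how the loaded data is meant to be
# consumed, not something the original code does.
if __name__ == "__main__":
    model = get_leaderboard_models_cached()[0]
    dataset = "mmlu_pro"  # assumed to be one of get_leaderboard_datasets()
    log_probs, labels = load_run_data(model, dataset)
    if log_probs:
        preds = [int(np.argmax(lp)) for lp in log_probs]
        accuracy = float(np.mean([p == y for p, y in zip(preds, labels)]))
        print(f"{model} on {dataset}: accuracy={accuracy:.3f} over {len(labels)} examples")
    else:
        print(f"No data loaded for {model} on {dataset}")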