import datasets
import numpy as np
from huggingface_hub import HfApi
from functools import lru_cache


def get_leaderboard_models():
    # api = HfApi()
    # # List all datasets in the open-llm-leaderboard organization
    # datasets = api.list_datasets(author="open-llm-leaderboard")
    models = []
    # for dataset in datasets:
    #     if dataset.id.endswith("-details"):
    #         # Format: "open-llm-leaderboard/<provider>__<model_name>-details"
    #         model_part = dataset.id.split("/")[-1].replace("-details", "")
    #         provider, model = model_part.split("__", 1)
    #         models.append(f"{provider}/{model}")

    # Example models (hard-coded subset instead of enumerating the Hub)
    # Note: the Hub organization is "meta-llama" (hyphen, not underscore);
    # load_run_data() relies on this to build the "-details" dataset id.
    models = [
        "meta-llama/Llama-3.2-1B-Instruct",
        "meta-llama/Llama-3.2-3B-Instruct",
        "meta-llama/Llama-3.1-8B-Instruct",
        "meta-llama/Llama-3.1-70B-Instruct",
        "meta-llama/Llama-3.3-70B-Instruct",
    ]
    return sorted(models)


@lru_cache(maxsize=1)
def get_leaderboard_models_cached():
    return get_leaderboard_models()


def get_leaderboard_datasets():
    return [
        "ai2_arc",
        "hellaswag",
        "mmlu_pro",
        "truthful_qa",
        "winogrande",
        "gsm8k",
    ]


def filter_labels(doc):
    """Extract ground-truth label indices from a list of task documents."""
    labels = []
    # Multiple-choice tasks store the gold answer as an index;
    # binary tasks store it as a "True"/"False" string.
    if "answer_index" in doc[0]:
        for d in doc:
            labels.append(int(d["answer_index"]))
    else:
        for d in doc:
            if d["answer"] == "False":
                labels.append(0)
            elif d["answer"] == "True":
                labels.append(1)
            else:
                raise ValueError(f"Invalid label: {d['answer']}")
    return labels


def load_run_data(model_name, dataset_name):
    try:
        model_name = model_name.replace("/", "__")
        data = datasets.load_dataset(
            "open-llm-leaderboard/" + model_name + "-details",
            name=model_name + "__leaderboard_" + dataset_name,
            split="latest",
        )
        data = data.sort("doc_id")
        data = data.to_dict()

        # Get log-probabilities for each response option
        log_probs = []
        for resp in data["filtered_resps"]:
            log_prob = np.array([float(option[0]) for option in resp])
            log_probs.append(log_prob)

        # Get ground-truth labels
        labels = filter_labels(data["doc"])
    except Exception as e:
        print(e)
        log_probs = []
        labels = []
    return log_probs, labels
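

# Minimal usage sketch (illustrative, not part of the original module):
# loads one model/dataset pair and scores accuracy by taking the argmax of
# the per-option log-probabilities returned by load_run_data(). Assumes
# network access to the Hugging Face Hub and that the corresponding
# "-details" dataset exists for the chosen model.
if __name__ == "__main__":
    model = get_leaderboard_models_cached()[0]
    log_probs, labels = load_run_data(model, "mmlu_pro")
    if log_probs:
        preds = [int(np.argmax(lp)) for lp in log_probs]
        acc = float(np.mean([p == y for p, y in zip(preds, labels)]))
        print(f"{model} on mmlu_pro: accuracy = {acc:.3f} over {len(labels)} docs")
    else:
        print(f"No data loaded for {model}")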