import datasets
import numpy as np
from huggingface_hub import HfApi
from functools import lru_cache


def get_leaderboard_models():
    # api = HfApi()
    # List all datasets in the open-llm-leaderboard organization
    # datasets = api.list_datasets(author="open-llm-leaderboard")
    models = []
    # for dataset in datasets:
    #     if dataset.id.endswith("-details"):
    #         # Format: "open-llm-leaderboard/<provider>__<model>-details"
    #         model_part = dataset.id.split("/")[-1].replace("-details", "")
    #         provider, model = model_part.split("__", 1)
    #         models.append(f"{provider}/{model}")

    # Example models
    models = [
        "meta_llama/Llama-3.2-1B-Instruct",
        "meta_llama/Llama-3.2-3B-Instruct",
        "meta_llama/Llama-3.1-8B-Instruct",
        "meta_llama/Llama-3.1-70B-Instruct",
        "meta_llama/Llama-3.3-70B-Instruct",
    ]
    return sorted(models)


@lru_cache(maxsize=1)
def get_leaderboard_models_cached():
    return get_leaderboard_models()


def get_leaderboard_datasets():
    return [
        "ai2_arc",
        "hellaswag",
        "mmlu_pro",
        "truthful_qa",
        "winogrande",
        "gsm8k",
    ]


def filter_labels(doc):
    """Extract ground-truth label indices from a list of leaderboard docs."""
    labels = []
    if "answer_index" in doc[0].keys():
        # Multiple-choice tasks store the index of the correct option directly
        for d in doc:
            labels.append(int(d["answer_index"]))
    else:
        # Binary tasks store the answer as a "True"/"False" string
        for d in doc:
            if d["answer"] == "False":
                labels.append(0)
            elif d["answer"] == "True":
                labels.append(1)
            else:
                raise ValueError("Invalid label")
    return labels


def load_run_data(model_name, dataset_name):
    try:
        model_name = model_name.replace("/", "__")
        data = datasets.load_dataset(
            "open-llm-leaderboard/" + model_name + "-details",
            name=model_name + "__leaderboard_" + dataset_name,
            split="latest",
        )
        data = data.sort("doc_id")
        data = data.to_dict()

        # Get log probabilities for each response option
        log_probs = []
        for resp in data["filtered_resps"]:
            log_prob = np.array([float(option[0]) for option in resp])
            log_probs.append(log_prob)

        # Get ground truth labels
        labels = filter_labels(data["doc"])
    except Exception as e:
        print(e)
        log_probs = []
        labels = []
    return log_probs, labels
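

# Example usage (a minimal sketch; assumes network access to the Hugging Face Hub
# and that "-details" datasets exist for the listed example models).
if __name__ == "__main__":
    model = get_leaderboard_models_cached()[0]
    for dataset_name in get_leaderboard_datasets():
        log_probs, labels = load_run_data(model, dataset_name)
        print(f"{dataset_name}: {len(log_probs)} responses, {len(labels)} labels")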