|  | import os | 
					
						
						|  | import shutil | 
					
						
						|  | import numpy as np | 
					
						
						|  | import gradio as gr | 
					
						
						|  | from huggingface_hub import Repository, HfApi | 
					
						
						|  | from transformers import AutoConfig, AutoModel | 
					
						
						|  | import json | 
					
						
						|  | from apscheduler.schedulers.background import BackgroundScheduler | 
					
						
						|  | import pandas as pd | 
					
						
						|  | import datetime | 
					
						
						|  | import glob | 
					
						
						|  | from dataclasses import dataclass | 
					
						
						|  | from typing import List, Tuple, Dict | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | H4_TOKEN = os.environ.get("H4_TOKEN", None) | 
					
						
						|  | LMEH_REPO = "HuggingFaceH4/lmeh_evaluations" | 
					
						
						|  |  | 
					
						
						|  | METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"] | 
					
						
						|  | BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"] | 
					
						
						|  | BENCH_TO_NAME = { | 
					
						
						|  | "arc_challenge": "ARC (25-shot) ⬆️", | 
					
						
						|  | "hellaswag": "HellaSwag (10-shot) ⬆️", | 
					
						
						|  | "hendrycks": "MMLU (5-shot) ⬆️", | 
					
						
						|  | "truthfulqa_mc": "TruthfulQA (0-shot) ⬆️", | 
					
						
						|  | } | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def make_clickable_model(model_name): | 
					
						
						|  | LLAMAS = [ | 
					
						
						|  | "huggingface/llama-7b", | 
					
						
						|  | "huggingface/llama-13b", | 
					
						
						|  | "huggingface/llama-30b", | 
					
						
						|  | "huggingface/llama-65b", | 
					
						
						|  | ] | 
					
						
						|  | if model_name in LLAMAS: | 
					
						
						|  | model = model_name.split("/")[1] | 
					
						
						|  | return f'<a target="_blank" href="https://ai.facebook.com/blog/large-language-model-llama-meta-ai/" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model}</a>' | 
					
						
						|  |  | 
					
						
						|  | if model_name == "HuggingFaceH4/stable-vicuna-13b-2904": | 
					
						
						|  | link = "https://huggingface.co/" + "CarperAI/stable-vicuna-13b-delta" | 
					
						
						|  | return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">stable-vicuna-13b</a>' | 
					
						
						|  |  | 
					
						
						|  | if model_name == "HuggingFaceH4/llama-7b-ift-alpaca": | 
					
						
						|  | link = "https://crfm.stanford.edu/2023/03/13/alpaca.html" | 
					
						
						|  | return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">alpaca-13b</a>' | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | link = "https://huggingface.co/" + model_name | 
					
						
						|  | return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>' | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | @dataclass | 
					
						
						|  | class EvalResult: | 
					
						
						|  | eval_name: str | 
					
						
						|  | org: str | 
					
						
						|  | model: str | 
					
						
						|  | revision: str | 
					
						
						|  | is_8bit: bool | 
					
						
						|  | results: dict | 
					
						
						|  |  | 
					
						
						|  | def to_dict(self): | 
					
						
						|  | if self.org is not None: | 
					
						
						|  | base_model = f"{self.org}/{self.model}" | 
					
						
						|  | else: | 
					
						
						|  | base_model = f"{self.model}" | 
					
						
						|  | data_dict = {} | 
					
						
						|  |  | 
					
						
						|  | data_dict["eval_name"] = self.eval_name | 
					
						
						|  | data_dict["8bit"] = self.is_8bit | 
					
						
						|  | data_dict["Model"] = make_clickable_model(base_model) | 
					
						
						|  |  | 
					
						
						|  | data_dict["model_name_for_query"] = base_model | 
					
						
						|  | data_dict["Revision"] = self.revision | 
					
						
						|  | data_dict["Average ⬆️"] = round( | 
					
						
						|  | sum([v for k, v in self.results.items()]) / 4.0, 1 | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | for benchmark in BENCHMARKS: | 
					
						
						|  | if not benchmark in self.results.keys(): | 
					
						
						|  | self.results[benchmark] = None | 
					
						
						|  |  | 
					
						
						|  | for k, v in BENCH_TO_NAME.items(): | 
					
						
						|  | data_dict[v] = self.results[k] | 
					
						
						|  |  | 
					
						
						|  | return data_dict | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def parse_eval_result(json_filepath: str) -> Tuple[str, dict]: | 
					
						
						|  | with open(json_filepath) as fp: | 
					
						
						|  | data = json.load(fp) | 
					
						
						|  |  | 
					
						
						|  | path_split = json_filepath.split("/") | 
					
						
						|  | org = None | 
					
						
						|  | model = path_split[-4] | 
					
						
						|  | is_8bit = path_split[-2] == "8bit" | 
					
						
						|  | revision = path_split[-3] | 
					
						
						|  | if len(path_split) == 7: | 
					
						
						|  |  | 
					
						
						|  | result_key = f"{path_split[-4]}_{path_split[-3]}_{path_split[-2]}" | 
					
						
						|  | else: | 
					
						
						|  | result_key = ( | 
					
						
						|  | f"{path_split[-5]}_{path_split[-4]}_{path_split[-3]}_{path_split[-2]}" | 
					
						
						|  | ) | 
					
						
						|  | org = path_split[-5] | 
					
						
						|  |  | 
					
						
						|  | eval_result = None | 
					
						
						|  | for benchmark, metric in zip(BENCHMARKS, METRICS): | 
					
						
						|  | if benchmark in json_filepath: | 
					
						
						|  | accs = np.array([v[metric] for k, v in data["results"].items()]) | 
					
						
						|  | mean_acc = round(np.mean(accs) * 100.0, 1) | 
					
						
						|  | eval_result = EvalResult( | 
					
						
						|  | result_key, org, model, revision, is_8bit, {benchmark: mean_acc} | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | return result_key, eval_result | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_eval_results(is_public) -> List[EvalResult]: | 
					
						
						|  | json_filepaths = glob.glob( | 
					
						
						|  | "evals/eval_results/public/**/16bit/*.json", recursive=True | 
					
						
						|  | ) | 
					
						
						|  | if not is_public: | 
					
						
						|  | json_filepaths += glob.glob( | 
					
						
						|  | "evals/eval_results/private/**/*.json", recursive=True | 
					
						
						|  | ) | 
					
						
						|  | json_filepaths += glob.glob( | 
					
						
						|  | "evals/eval_results/private/**/*.json", recursive=True | 
					
						
						|  | ) | 
					
						
						|  | json_filepaths += glob.glob( | 
					
						
						|  | "evals/eval_results/public/**/8bit/*.json", recursive=True | 
					
						
						|  | ) | 
					
						
						|  | eval_results = {} | 
					
						
						|  |  | 
					
						
						|  | for json_filepath in json_filepaths: | 
					
						
						|  | result_key, eval_result = parse_eval_result(json_filepath) | 
					
						
						|  | if result_key in eval_results.keys(): | 
					
						
						|  | eval_results[result_key].results.update(eval_result.results) | 
					
						
						|  | else: | 
					
						
						|  | eval_results[result_key] = eval_result | 
					
						
						|  |  | 
					
						
						|  | eval_results = [v for k, v in eval_results.items()] | 
					
						
						|  |  | 
					
						
						|  | return eval_results | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def get_eval_results_dicts(is_public=True) -> List[Dict]: | 
					
						
						|  | eval_results = get_eval_results(is_public) | 
					
						
						|  |  | 
					
						
						|  | return [e.to_dict() for e in eval_results] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | eval_results_dict = get_eval_results_dicts() | 
					
						
						|  |  | 
					
						
						|  |  |