Commit fcb01e3 · Parent: b2c063a
edbeeching committed: updates table to include revision
app.py CHANGED

@@ -46,8 +46,8 @@ def load_results(model, benchmark, metric):
     return mean_acc, data["config"]["model_args"]
 
 
-COLS = ["
-TYPES = ["str",
+COLS = ["base_model", "revision", "8bit", "total ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthQA (0-shot) ⬆️"]
+TYPES = ["markdown","str", "bool", "number", "number", "number", "number", "number", ]
 
 EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
 EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]

@@ -59,7 +59,7 @@ def get_leaderboard():
     all_data = get_eval_results_dicts()
     dataframe = pd.DataFrame.from_records(all_data)
     dataframe = dataframe.sort_values(by=['total ⬆️'], ascending=False)
-
+    print(dataframe)
     dataframe = dataframe[COLS]
     return dataframe
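COLS and TYPES must stay aligned index-for-index: gradio renders column i of the table with the type at position i ("markdown" for the clickable base_model link, "str" for the new revision column, "bool" for 8bit, "number" for the benchmark scores). A minimal sketch of how such a pair is typically wired into a gradio Dataframe; the Blocks layout below is an assumption for illustration, not the Space's actual app code, and it reuses get_leaderboard, COLS and TYPES from this file:

import gradio as gr

# Hypothetical wiring, assuming app.py's get_leaderboard/COLS/TYPES are in scope.
with gr.Blocks() as demo:
    leaderboard_table = gr.Dataframe(
        value=get_leaderboard(),  # pandas DataFrame already reordered to COLS
        headers=COLS,             # column labels, now including "revision"
        datatype=TYPES,           # per-column renderer type
    )

demo.launch()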
utils.py CHANGED

@@ -50,6 +50,7 @@ class EvalResult:
     eval_name : str
     org : str
     model : str
+    revision : str
     is_8bit : bool
     results : dict
 
@@ -60,8 +61,11 @@ class EvalResult:
         else:
             base_model =f"{self.model}"
         data_dict = {}
+
         data_dict["eval_name"] = self.eval_name
+        data_dict["8bit"] = self.is_8bit
         data_dict["base_model"] = make_clickable_model(base_model)
+        data_dict["revision"] = self.revision
         data_dict["total ⬆️"] = round(sum([v for k,v in self.results.items()]),3)
         data_dict["# params"] = get_n_params(base_model)
 
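The revision field threads through EvalResult so each leaderboard row can report which model revision was evaluated. A hedged, self-contained reconstruction of the row-building step for illustration only; the method name to_row and the plain base_model string are stand-ins (the real code wraps base_model with make_clickable_model and also adds a "# params" column via get_n_params):

from dataclasses import dataclass

@dataclass
class EvalResult:
    eval_name: str
    org: str
    model: str
    revision: str  # field added by this commit
    is_8bit: bool
    results: dict  # benchmark name -> mean accuracy

    def to_row(self) -> dict:  # hypothetical name for the diffed row builder
        base_model = f"{self.org}/{self.model}" if self.org else f"{self.model}"
        return {
            "eval_name": self.eval_name,
            "8bit": self.is_8bit,       # new column
            "base_model": base_model,
            "revision": self.revision,  # new column
            "total ⬆️": round(sum(self.results.values()), 3),
        }

print(EvalResult("gpt2_main_8bit", None, "gpt2", "main", True, {"arc": 0.437}).to_row())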
@@ -83,21 +87,22 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, dict]:
 
     path_split = json_filepath.split("/")
     org = None
-    model = path_split[-3]
+    model = path_split[-4]
     is_8bit = path_split[-2] == "8bit"
-    if len(path_split)== 5:
+    revision = path_split[-3]
+    if len(path_split)== 6:
         # handles gpt2 type models that don't have an org
-        result_key = f"{path_split[-3]}_{path_split[-2]}"
-    else:
         result_key = f"{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
-        org = path_split[-4]
+    else:
+        result_key = f"{path_split[-5]}_{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
+        org = path_split[-5]
 
     eval_result = None
     for benchmark, metric in zip(BENCHMARKS, METRICS):
         if benchmark in json_filepath:
             accs = np.array([v[metric] for k, v in data["results"].items()])
             mean_acc = round(np.mean(accs),3)
-            eval_result = EvalResult(result_key, org, model, is_8bit, {benchmark:mean_acc})
+            eval_result = EvalResult(result_key, org, model, revision, is_8bit, {benchmark:mean_acc})
 
     return result_key, eval_result
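Because the results path now carries a revision component, every negative index shifts by one and the no-org case has six path components instead of five. A worked example of the new indexing, copied standalone from the diffed logic and run on hypothetical file paths (the Space's real directory layout may differ):

# Standalone copy of the diffed parsing logic, for checking the indices.
def parse_path(json_filepath: str):
    path_split = json_filepath.split("/")
    org = None
    model = path_split[-4]
    is_8bit = path_split[-2] == "8bit"
    revision = path_split[-3]
    if len(path_split) == 6:
        # gpt2-type models that don't have an org
        result_key = f"{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
    else:
        result_key = f"{path_split[-5]}_{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
        org = path_split[-5]
    return org, model, revision, is_8bit, result_key

# Hypothetical layout: <root>/<results>/[org/]<model>/<revision>/<precision>/<file>
print(parse_path("evals/results/gpt2/main/8bit/arc.json"))
# -> (None, 'gpt2', 'main', True, 'gpt2_main_8bit')
print(parse_path("evals/results/bigscience/bloom/main/16bit/arc.json"))
# -> ('bigscience', 'bloom', 'main', False, 'bigscience_bloom_main_16bit')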
|