Add Solbench score
- src/display/utils.py +2 -1
- src/leaderboard/read_evals.py +7 -2
- src/populate.py +2 -1
src/display/utils.py

@@ -38,7 +38,8 @@ auto_eval_column_dict = [
         "", "str", True, never_hidden=True)),
     ("model", ColumnContent, create_column_content(
         "Model", "markdown", True, never_hidden=True)),
-    ("average", ColumnContent, create_column_content("Average", "number", True)),
+    ("solbench", ColumnContent, create_column_content("Score", "number", True)),
+    # ("average", ColumnContent, create_column_content("Average", "number", True)),
 ]
 
 # Add task-specific columns
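Note on the column definition above: the entry is keyed "solbench" but displayed as "Score". Below is a minimal, hypothetical sketch of how such an entry typically becomes addressable as AutoEvalColumn.solbench; the ColumnContent and create_column_content stand-ins only approximate the arguments visible in the diff, and the make_dataclass step is an assumption based on the common leaderboard-template pattern, not taken from this repo.

# Hedged sketch only: ColumnContent and create_column_content are simplified stand-ins.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str          # label shown in the leaderboard UI, e.g. "Score"
    type: str          # "str", "markdown", "number", ...
    displayed_by_default: bool = True
    never_hidden: bool = False

def create_column_content(name, col_type, displayed_by_default, never_hidden=False):
    return ColumnContent(name, col_type, displayed_by_default, never_hidden)

auto_eval_column_dict = [
    ("model", ColumnContent, create_column_content("Model", "markdown", True, never_hidden=True)),
    ("solbench", ColumnContent, create_column_content("Score", "number", True)),
]

# Assumed pattern: the (name, type, default) tuples become a frozen dataclass.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print(AutoEvalColumn.solbench.name)  # -> "Score"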
src/leaderboard/read_evals.py

@@ -114,7 +114,11 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        solbench = sum([
+            self.results.get('naive_judge', 0),
+            self.results.get('human_eval_solidity', 0)
+        ]) / 2
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,

@@ -124,7 +128,8 @@ class EvalResult:
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.model_name),
             AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
+            # AutoEvalColumn.average.name: average,
+            AutoEvalColumn.solbench.name: solbench,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
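The Solbench score computed above is the plain mean of the two task results, naive_judge and human_eval_solidity, with a missing result counted as 0 via dict.get. A small worked example (the result values are made up):

# Illustration only; the result values are hypothetical.
results = {"naive_judge": 0.72, "human_eval_solidity": 0.58}
solbench = sum([
    results.get("naive_judge", 0),
    results.get("human_eval_solidity", 0),
]) / 2
print(solbench)  # 0.65

# A missing task still counts as 0 in the average:
print(sum([results.get("naive_judge", 0), {}.get("human_eval_solidity", 0)]) / 2)  # 0.36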
src/populate.py

@@ -19,7 +19,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df.sort_values(by=[AutoEvalColumn.solbench.name], ascending=False)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
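With the change above, the leaderboard dataframe is ordered by the Solbench column instead of the old average. A hedged sketch of the effect, assuming the column's display label is the "Score" string set in src/display/utils.py and using made-up rows:

# Illustration only; model names and scores are made up.
import pandas as pd

df = pd.DataFrame.from_records([
    {"Model": "model-a", "Score": 0.65},
    {"Model": "model-b", "Score": 0.71},
])
df = df.sort_values(by=["Score"], ascending=False).round(decimals=2)
print(df)  # model-b (0.71) now appears first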