Update space
- app.py +2 -2
- src/leaderboard/read_evals.py +32 -10
- src/populate.py +3 -0
app.py
CHANGED

@@ -96,7 +96,7 @@ def init_leaderboard(dataframe):
         interactive=False,
     )

-
+model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
 model_leaderboard_df = get_model_leaderboard_df(model_result_path)

 def overall_leaderboard(dataframe):

@@ -129,7 +129,7 @@ with demo:


     with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
-        leaderboard =
+        leaderboard = overall_leaderboard(model_leaderboard_df)

     with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):

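For context, a minimal self-contained sketch of the wiring this commit adds to app.py: the results path is hard-coded at module level, turned into a dataframe once, and rendered read-only inside the Overall tab. Only get_model_leaderboard_df, overall_leaderboard, the JSONL path, and the tab labels come from the diff; the loader body and the launch scaffolding are assumptions.

import json

import gradio as gr
import pandas as pd


def get_model_leaderboard_df(results_path: str) -> pd.DataFrame:
    # Assumed stand-in for src.populate.get_model_leaderboard_df:
    # the results file is JSONL, one model record per line.
    with open(results_path) as f:
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame.from_records(records)


def overall_leaderboard(dataframe: pd.DataFrame):
    # Mirrors the diff's intent: a read-only table per tab.
    return gr.Dataframe(value=dataframe, interactive=False)


model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
model_leaderboard_df = get_model_leaderboard_df(model_result_path)

with gr.Blocks() as demo:
    with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
        leaderboard = overall_leaderboard(model_leaderboard_df)

if __name__ == "__main__":
    demo.launch()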
src/leaderboard/read_evals.py
CHANGED

@@ -30,10 +30,10 @@ class ModelResult:
         config = data.get("config")
         # Get model and org
         model = config.get("model_name")
-        org = config.get("
+        org = config.get("organization")
         license = config.get("license")
         knowledge_cutoff = config.get("knowledge_cutoff")
-
+
         # Extract results available in this file (some results are split in several files)
         results = {}
         for domain in Domains:

@@ -75,8 +75,8 @@ class ModelResult:
             # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }

-        for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+        # for task in Tasks:
+        #     data_dict[task.value.col_name] = self.results[task.value.benchmark]

         for domain in Domains:
             data_dict[domain.value.col_name] = self.results[domain.value.dimension]

@@ -277,26 +277,48 @@ def get_raw_model_results(results_path: str) -> list[EvalResult]:
     except:
         data = eval(open(results_path).read())  # a list of dicts

+    # print("data", len(data))
+    # print(data[0])
+    # {'config': {'model_name': 'ChatGPT-4o-latest (2024-09-03)',
+    #             'organization': 'OpenAI', 'license': 'Proprietary',
+    #             'knowledge_cutoff': '2023/10'},
+    #  'results': {'math-algebra':
+    #                {'Score': 99.19484702, 'Avg Rank': 1.666666667, 'Min Rank': 1, 'Max Rank': 3},
+    #              'math-probability': {'Score': 100, 'Avg Rank': 1, 'Min Rank': 1, 'Max Rank': 1},
+    #              'reasoning-logical': {'Avg Rank': 1, 'Min Rank': 1, 'Max Rank': 1},
+    #              'overall': {'Avg Rank': 2, 'Min Rank': 2, 'Max Rank': 2}}}
     eval_results = {}

     for result in data:
         # Creation of result
         eval_result = ModelResult.init_from_json_dict(result)
+        # print(eval_result)
+        # ModelResult(eval_name='OpenAI_ChatGPT-4o-latest (2024-09-03)',
+        #             full_model='OpenAI/ChatGPT-4o-latest (2024-09-03)',
+        #             org='OpenAI', model='ChatGPT-4o-latest (2024-09-03)',
+        #             results={'overall': None}, license='Proprietary', knowledge_cutoff='2023/10')

-        # Store results of same eval together
         eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
+        eval_results[eval_name] = eval_result
+
+        # # Store results of same eval together
+        # if eval_name in eval_results.keys():
+        #     eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        # else:
+        #     eval_results[eval_name] = eval_result

     results = []
     for v in eval_results.values():
+        # print(v.to_dict())
+        # {'eval_name': 'OpenAI_ChatGPT-4o-latest (2024-09-03)',
+        #  'Model': '<a target="_blank" href="https://huggingface.co/OpenAI/ChatGPT-4o-latest (2024-09-03)"
+        #            style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">OpenAI/ChatGPT-4o-latest (2024-09-03)</a>',
+        #  'Hub License': 'Proprietary', 'Organization': 'OpenAI', 'Knowledge cutoff': '2023/10', 'Overall': None}
         try:
             v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
             continue
-
+
     return results

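The substantive change in read_evals.py is the deduplication step: merging partial results from several files was replaced by a last-write-wins overwrite. A toy sketch of the behavioural difference, with ToyResult as a hypothetical stand-in for ModelResult:

from dataclasses import dataclass, field


@dataclass
class ToyResult:
    # Hypothetical stand-in for ModelResult; only the fields the dedup touches.
    eval_name: str
    results: dict = field(default_factory=dict)


incoming = [
    ToyResult("OpenAI_ChatGPT-4o-latest", {"math": 99.2, "reasoning": None}),
    ToyResult("OpenAI_ChatGPT-4o-latest", {"reasoning": 1.0}),
]

# New behaviour: plain overwrite; each eval_name keeps only its last-seen result.
eval_results = {}
for r in incoming:
    eval_results[r.eval_name] = r
print(eval_results["OpenAI_ChatGPT-4o-latest"].results)  # {'reasoning': 1.0}

# Old (now commented-out) behaviour: merge non-None values across files.
merged = {}
for r in incoming:
    if r.eval_name in merged:
        merged[r.eval_name].results.update(
            {k: v for k, v in r.results.items() if v is not None})
    else:
        merged[r.eval_name] = r
print(merged["OpenAI_ChatGPT-4o-latest"].results)  # {'math': 99.2, 'reasoning': 1.0}

The overwrite is only safe if each line of the new JSONL already carries a model's complete results; with results split across files, the old merge would still be needed.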
src/populate.py
CHANGED

@@ -15,6 +15,9 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: list

     df = pd.DataFrame.from_records(all_data_json)
     # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    # print(cols) # []
+    # print(df.columns) # ['eval_name', 'Model', 'Hub License', 'Organization', 'Knowledge cutoff', 'Overall']
+    # exit()
     for col in cols:
         if col not in df.columns:
             df[col] = None
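The three added lines in populate.py are commented-out debugging, but they record the dataframe's shape at this point. For reference, a small sketch of the column guard they sit above, with made-up records and cols:

import pandas as pd

# Made-up records mirroring the columns noted in the commented print.
all_data_json = [
    {"eval_name": "OpenAI_ChatGPT-4o-latest", "Organization": "OpenAI", "Overall": None},
]
cols = ["Model", "Overall"]  # hypothetical; the debug print shows cols == [] in practice

df = pd.DataFrame.from_records(all_data_json)
for col in cols:
    if col not in df.columns:
        df[col] = None  # backfill any expected column the records lack

print(df.columns.tolist())
# ['eval_name', 'Organization', 'Overall', 'Model']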