Fix metrics
app.py (CHANGED)
@@ -37,11 +37,17 @@ def get_leaderboard_df():
         data = json.load(file)
         first_result_key = next(iter(data["results"]))  # gets the first key in 'results'
         # TruthfulQA has two metrics, so we need to pick the `mc2` one that's reported on the leaderboard
-        if task == "truthfulqa":
+        if task.lower() == "truthfulqa":
             value = data["results"][first_result_key]["truthfulqa_mc2"]
         # IFEval has several metrics but we report just the prompt-loose-acc one
-        elif task == "ifeval":
+        elif task.lower() == "ifeval":
             value = data["results"][first_result_key]["prompt_level_loose_acc"]
+        # MMLU has several metrics but we report just the average one
+        elif task.lower() == "mmlu":
+            value = data["results"]["lighteval|mmlu:_average|5"]["acc"]
+        # HellaSwag and ARC report acc_norm
+        elif task.lower() in ["hellaswag", "arc"]:
+            value = data["results"][first_result_key]["acc_norm"]
         else:
             first_metric_key = next(
                 iter(data["results"][first_result_key])
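
For context, here is a minimal sketch of how this branching behaves on a results file. The JSON shape (data["results"][key][metric]) follows the code above; the sample keys and scores are made up, apart from "lighteval|mmlu:_average|5", which the new MMLU branch hard-codes. It illustrates why MMLU cannot rely on first_result_key: the file holds one entry per subject, so whichever key happens to come first is arbitrary, and the aggregate lives under a fixed key instead.

    import json

    # Hypothetical lighteval results file: key names and scores are made up,
    # except "lighteval|mmlu:_average|5", which the diff above hard-codes.
    SAMPLE = """
    {
      "results": {
        "lighteval|mmlu:abstract_algebra|5": {"acc": 0.31},
        "lighteval|mmlu:_average|5": {"acc": 0.642}
      }
    }
    """

    def pick_value(task: str, data: dict) -> float:
        """Sketch of the branching in get_leaderboard_df, not the app's API."""
        first_result_key = next(iter(data["results"]))
        task = task.lower()
        if task == "truthfulqa":
            return data["results"][first_result_key]["truthfulqa_mc2"]
        if task == "ifeval":
            return data["results"][first_result_key]["prompt_level_loose_acc"]
        if task == "mmlu":
            # One entry per subject, so the first key is arbitrary;
            # the aggregate lives under a fixed key instead.
            return data["results"]["lighteval|mmlu:_average|5"]["acc"]
        if task in ["hellaswag", "arc"]:
            return data["results"][first_result_key]["acc_norm"]
        # Fallback: take the first reported metric, as the original code does.
        first_metric_key = next(iter(data["results"][first_result_key]))
        return data["results"][first_result_key][first_metric_key]

    data = json.loads(SAMPLE)
    print(pick_value("mmlu", data))  # 0.642 (the average), not the per-subject 0.31

The switch to task.lower() in the diff also makes every lookup robust to however the task name happens to be capitalized upstream.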