Commit 57fd1ce · Parent: 3df6003
[ADD] CI for open-ended

Files changed:
- src/about.py (+4 -3)
- src/leaderboard/read_evals.py (+7 -13)
src/about.py CHANGED

@@ -34,9 +34,10 @@ class OpenEndedColumn:
 
 class OpenEndedColumns(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    column0 = OpenEndedColumn("ELO", "score", "ELO")
-    column1 = OpenEndedColumn("
-
+    column0 = OpenEndedColumn("ELO", "score", "ELO")
+    column1 = OpenEndedColumn("ELO_intervals", "score", "ELO 95% CI")
+    column2 = OpenEndedColumn("Score", "score", "Score")
+    column3 = OpenEndedColumn("Score_intervals", "score", "Score 95% CI")
     # changes to be made here
 
 @dataclass
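For reference, a minimal self-contained sketch of the enum this hunk produces. The field names below (benchmark, metric, col_name) are assumptions inferred from the "task_key ... metric_key ... name to display" comment and from the task.benchmark accesses in read_evals.py; only the member values are taken verbatim from the diff.

# Sketch, not the Space's actual file: field names are assumptions.
from dataclasses import dataclass
from enum import Enum

@dataclass
class OpenEndedColumn:
    benchmark: str  # task_key in the results json
    metric: str     # metric_key in the results json
    col_name: str   # name to display in the leaderboard

class OpenEndedColumns(Enum):
    column0 = OpenEndedColumn("ELO", "score", "ELO")
    column1 = OpenEndedColumn("ELO_intervals", "score", "ELO 95% CI")
    column2 = OpenEndedColumn("Score", "score", "Score")
    column3 = OpenEndedColumn("Score_intervals", "score", "Score 95% CI")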
src/leaderboard/read_evals.py CHANGED

@@ -106,25 +106,19 @@ class EvalResult:
             for task in OpenEndedColumns:
                 task = task.value
                 # We average all scores of a given metric (not all metrics are present in all files)
-                accs =
-
-
-
-                open_ended_results[
+                accs = data["results"]["open-ended"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended"]["overall"] else None
+                open_ended_results[task.benchmark] = accs
+            if open_ended_results["ELO_intervals"] is not None and open_ended_results["Score_intervals"] is not None:
+                open_ended_results["ELO_intervals"] = "+" + str(open_ended_results["ELO_intervals"][1]) + "/-" + str(abs(open_ended_results["ELO_intervals"][0]))
+                open_ended_results["Score_intervals"] = "+" + str(open_ended_results["Score_intervals"][1]) + "/-" + str(abs(open_ended_results["Score_intervals"][0]))
         # breakpoint()
         # changes to be made here
         med_safety_results = {}
         if "med-safety" in data["results"]:
             for task in MedSafetyColumns:
                 task = task.value
-
-
-                except:
-                    accs = np.array([])
-                if accs.size == 0 or any([acc is None for acc in accs]):
-                    continue
-                mean_acc = np.mean(accs)  # * 100.0
-                med_safety_results[task.benchmark] = mean_acc
+                accs = data["results"]["med-safety"][task.benchmark]["score"]
+                med_safety_results[task.benchmark] = accs
         medical_summarization_results = {}
         if "medical-summarization" in data["results"]:
             for task in MedicalSummarizationColumns:
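To see what the open-ended hunk computes, here is a standalone sketch of the interval formatting under an assumed payload. The format_ci helper and every key and value in the sample data dict are illustrative, not taken from the Space; the *_intervals entries are assumed to be [lower, upper] offsets, which is what the abs() on index 0 in the hunk implies.

def format_ci(interval):
    # Renders a [lower, upper] CI pair as "+upper/-|lower|", exactly as the
    # string concatenation in the hunk above does.
    lower, upper = interval
    return "+" + str(upper) + "/-" + str(abs(lower))

# Assumed shape of data["results"], reconstructed from the keys the new code reads;
# all numbers and the med-safety benchmark key are hypothetical.
data = {
    "results": {
        "open-ended": {
            "overall": {
                "ELO": 1032.4,
                "ELO_intervals": [-14.2, 15.8],
                "Score": 7.1,
                "Score_intervals": [-0.3, 0.2],
            }
        },
        "med-safety": {"category_1": {"score": 4.6}},
    }
}

overall = data["results"]["open-ended"]["overall"]
open_ended_results = {k: overall.get(k) for k in ("ELO", "ELO_intervals", "Score", "Score_intervals")}
if open_ended_results["ELO_intervals"] is not None and open_ended_results["Score_intervals"] is not None:
    open_ended_results["ELO_intervals"] = format_ci(open_ended_results["ELO_intervals"])
    open_ended_results["Score_intervals"] = format_ci(open_ended_results["Score_intervals"])

print(open_ended_results["ELO_intervals"])    # +15.8/-14.2
print(open_ended_results["Score_intervals"])  # +0.2/-0.3

Note that the guard mirrors the diff: if either interval is missing from the json, both interval entries are left as None rather than partially formatted. The med-safety hunk makes the same move in simpler form, replacing the old try/except-and-average path with a direct read of each benchmark's precomputed "score".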