tathagataraha committed
Commit 57fd1ce · Parent: 3df6003

[ADD] CI for open-ended

Files changed (2):
  1. src/about.py (+4 -3)
  2. src/leaderboard/read_evals.py (+7 -13)
src/about.py CHANGED
@@ -34,9 +34,10 @@ class OpenEndedColumn:
 
 class OpenEndedColumns(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    column0 = OpenEndedColumn("ELO", "score", "ELO")
-    column1 = OpenEndedColumn("Score", "score", "Score")
-
+    column0 = OpenEndedColumn("ELO", "score", "ELO")
+    column1 = OpenEndedColumn("ELO_intervals", "score", "ELO 95% CI")
+    column2 = OpenEndedColumn("Score", "score", "Score")
+    column3 = OpenEndedColumn("Score_intervals", "score", "Score 95% CI")
     # changes to be made here
 
 @dataclass
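
For reference, a minimal runnable sketch of how these enum entries are presumably structured and consumed; the OpenEndedColumn field names (benchmark, metric, col_name) are assumptions inferred from the in-code comment ("task_key ... metric_key ... name to display"), not confirmed by this commit.

from dataclasses import dataclass
from enum import Enum

@dataclass
class OpenEndedColumn:
    # field names are assumed from the comment in the diff:
    benchmark: str  # task_key in the json file
    metric: str     # metric_key in the json file
    col_name: str   # name to display in the leaderboard

class OpenEndedColumns(Enum):
    column0 = OpenEndedColumn("ELO", "score", "ELO")
    column1 = OpenEndedColumn("ELO_intervals", "score", "ELO 95% CI")
    column2 = OpenEndedColumn("Score", "score", "Score")
    column3 = OpenEndedColumn("Score_intervals", "score", "Score 95% CI")

# the reader code below iterates the enum and uses task.benchmark as the JSON key
for task in OpenEndedColumns:
    print(task.value.benchmark, "->", task.value.col_name)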
src/leaderboard/read_evals.py CHANGED
@@ -106,25 +106,19 @@ class EvalResult:
             for task in OpenEndedColumns:
                 task = task.value
                 # We average all scores of a given metric (not all metrics are present in all files)
-                accs = np.array([v for k, v in data["results"]["open-ended"]["overall"].items() if task.benchmark == k])
-                if accs.size == 0 or any([acc is None for acc in accs]):
-                    continue
-                mean_acc = np.mean(accs) # * 100.0
-                open_ended_results[task.benchmark] = mean_acc
+                accs = data["results"]["open-ended"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended"]["overall"] else None
+                open_ended_results[task.benchmark] = accs
+            if open_ended_results["ELO_intervals"] is not None and open_ended_results["Score_intervals"] is not None:
+                open_ended_results["ELO_intervals"] = "+" + str(open_ended_results["ELO_intervals"][1]) + "/-" + str(abs(open_ended_results["ELO_intervals"][0]))
+                open_ended_results["Score_intervals"] = "+" + str(open_ended_results["Score_intervals"][1]) + "/-" + str(abs(open_ended_results["Score_intervals"][0]))
             # breakpoint()
             # changes to be made here
         med_safety_results = {}
         if "med-safety" in data["results"]:
             for task in MedSafetyColumns:
                 task = task.value
-                try:
-                    accs = np.array([v.get(task.metric, None) for k, v in data["results"]["med-safety"].items() if task.benchmark == k])
-                except:
-                    accs = np.array([])
-                if accs.size == 0 or any([acc is None for acc in accs]):
-                    continue
-                mean_acc = np.mean(accs) # * 100.0
-                med_safety_results[task.benchmark] = mean_acc
+                accs = data["results"]["med-safety"][task.benchmark]["score"]
+                med_safety_results[task.benchmark] = accs
         medical_summarization_results = {}
         if "medical-summarization" in data["results"]:
             for task in MedicalSummarizationColumns:
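
To illustrate the new interval handling, here is a minimal standalone sketch, assuming each *_intervals entry in the results JSON is a [lower_offset, upper_offset] pair around the point estimate (an assumption; the commit only shows the read side, and the sample values below are hypothetical):

# hypothetical shape of data["results"]["open-ended"]["overall"]
overall = {
    "ELO": 1042.7,
    "ELO_intervals": [-12.3, 10.5],   # assumed [lower_offset, upper_offset]
    "Score": 7.81,
    "Score_intervals": [-0.12, 0.09],
}

# mirror the commit's logic: copy raw values, then render CIs as "+hi/-lo" strings
open_ended_results = {k: overall.get(k) for k in ("ELO", "ELO_intervals", "Score", "Score_intervals")}
if open_ended_results["ELO_intervals"] is not None and open_ended_results["Score_intervals"] is not None:
    for key in ("ELO_intervals", "Score_intervals"):
        lower, upper = open_ended_results[key]
        open_ended_results[key] = "+" + str(upper) + "/-" + str(abs(lower))

print(open_ended_results["ELO_intervals"])    # +10.5/-12.3
print(open_ended_results["Score_intervals"])  # +0.09/-0.12

Note that the formatted string replaces the raw list in place, so downstream display code receives a ready-made "+hi/-lo" label rather than the interval pair.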