choco9966 committed on
Commit 07cd8a1 · verified · 1 Parent(s): d3bf055

Update src/leaderboard/read_evals.py

Files changed (1)
  1. src/leaderboard/read_evals.py +0 -24
src/leaderboard/read_evals.py CHANGED
@@ -103,13 +103,6 @@ class EvalResult:
                     results[task.benchmark] = 0.0
                     continue
 
-            # New tasks have been added, we need to skip them if not exists
-            if task.benchmark in ["ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow", "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness"]:
-                accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
-                if accs.size == 0 or any([acc is None for acc in accs]):
-                    results[task.benchmark] = 0.0
-                    continue
-
             # We average all scores of a given metric (mostly for mmlu)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
@@ -154,23 +147,6 @@ class EvalResult:
         # Skip the new tasks for now
         # TODO: safely remove this code when the task results are all added
         skip_avg_len = 0
-        if self.results['ko_winogrande'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_gsm8k'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_eq_bench'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_inst_follow'] == 0.0:
-            skip_avg_len += 1
-        if self.results['kor_nat_cka'] == 0.0:
-            skip_avg_len += 1
-        if self.results['kor_nat_sva'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_harmlessness'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_helpfulness'] == 0.0:
-            skip_avg_len += 1
-
         average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)
 
         data_dict = {
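Net effect of the change: the special-case read path for the newly added Korean benchmarks is removed (the generic metric lookup just below it is now the only path), and because skip_avg_len is never incremented any more, the leaderboard average always divides by len(Tasks). Below is a minimal sketch of the before/after averaging, with a plain results dict and len(results) standing in for EvalResult.results and len(Tasks); the names average_before, average_after and NEW_TASKS are illustrative and not part of the repository.

# Hypothetical helpers sketching the averaging change; the real logic lives in EvalResult.to_dict().
NEW_TASKS = ["ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow",
             "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness"]

def average_before(results):
    # Old behaviour: a 0.0 score on any of the new tasks shrank the divisor.
    skip_avg_len = sum(1 for t in NEW_TASKS if results.get(t) == 0.0)
    return sum(v for v in results.values() if v is not None) / (len(results) - skip_avg_len)

def average_after(results):
    # New behaviour: every task counts toward the divisor, zero-scored or not.
    return sum(v for v in results.values() if v is not None) / len(results)

In other words, a model scoring 0.0 on one of these tasks now gets a lower average than before: the zero still adds nothing to the numerator, but the divisor no longer shrinks to compensate.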
 