Joschka Strueber committed on
Commit
c24946e
·
1 Parent(s): 26c0eec

[Fix] removal of not working benchmarks

Browse files
Files changed (1) hide show
  1. src/dataloading.py +1 -5
src/dataloading.py CHANGED
@@ -87,7 +87,7 @@ def get_leaderboard_datasets(model_ids):
87
  common_datasets = set.intersection(*model_datasets.values())
88
 
89
  # Filter datasets that are not MCQ or currently do not work
90
- ignore = ["_bbh_", "_gpqa_", "_math_", "_ifeval_"]
91
  discard = []
92
  for dataset in common_datasets:
93
  for ignore_data in ignore:
@@ -104,7 +104,6 @@ def filter_labels(doc):
104
  for d in doc:
105
  labels.append(d["answer_index"])
106
  else:
107
- print("Target:", doc[0]["target"])
108
  for d in doc:
109
  if d["target"] == "False":
110
  labels.append(0)
@@ -147,6 +146,3 @@ def load_run_data(model_name, dataset_name):
147
  def load_run_data_cached(model_name, dataset_name):
148
  return load_run_data(model_name, dataset_name)
149
 
150
-
151
- if __name__ == "__main__":
152
- get_leaderboard_models_reload()
 
87
  common_datasets = set.intersection(*model_datasets.values())
88
 
89
  # Filter datasets that are not MCQ or currently do not work
90
+ ignore = ["bbh_", "gpqa_", "math_", "ifeval"]
91
  discard = []
92
  for dataset in common_datasets:
93
  for ignore_data in ignore:
 
104
  for d in doc:
105
  labels.append(d["answer_index"])
106
  else:
 
107
  for d in doc:
108
  if d["target"] == "False":
109
  labels.append(0)
 
146
  def load_run_data_cached(model_name, dataset_name):
147
  return load_run_data(model_name, dataset_name)
148