Spaces:
Running
Running
Joschka Strueber
committed on
Commit
·
c24946e
1
Parent(s):
26c0eec
[Fix] removal of not working benchmarks
Browse files
- src/dataloading.py +1 -5
src/dataloading.py
CHANGED
@@ -87,7 +87,7 @@ def get_leaderboard_datasets(model_ids):
|
|
87 |
common_datasets = set.intersection(*model_datasets.values())
|
88 |
|
89 |
# Filter datasets that are not MCQ or currently do not work
|
90 |
-
ignore = ["
|
91 |
discard = []
|
92 |
for dataset in common_datasets:
|
93 |
for ignore_data in ignore:
|
@@ -104,7 +104,6 @@ def filter_labels(doc):
|
|
104 |
for d in doc:
|
105 |
labels.append(d["answer_index"])
|
106 |
else:
|
107 |
-
print("Target:", doc[0]["target"])
|
108 |
for d in doc:
|
109 |
if d["target"] == "False":
|
110 |
labels.append(0)
|
@@ -147,6 +146,3 @@ def load_run_data(model_name, dataset_name):
|
|
147 |
def load_run_data_cached(model_name, dataset_name):
|
148 |
return load_run_data(model_name, dataset_name)
|
149 |
|
150 |
-
|
151 |
-
if __name__ == "__main__":
|
152 |
-
get_leaderboard_models_reload()
|
|
|
87 |
common_datasets = set.intersection(*model_datasets.values())
|
88 |
|
89 |
# Filter datasets that are not MCQ or currently do not work
|
90 |
+
ignore = ["bbh_", "gpqa_", "math_", "ifeval"]
|
91 |
discard = []
|
92 |
for dataset in common_datasets:
|
93 |
for ignore_data in ignore:
|
|
|
104 |
for d in doc:
|
105 |
labels.append(d["answer_index"])
|
106 |
else:
|
|
|
107 |
for d in doc:
|
108 |
if d["target"] == "False":
|
109 |
labels.append(0)
|
|
|
146 |
def load_run_data_cached(model_name, dataset_name):
|
147 |
return load_run_data(model_name, dataset_name)
|
148 |
|
|
|
|
|
|