Commit 47170a5
David Pomerenke committed
Parent(s): 276ec94

MMLU data loader for 3 parallel datasets

Files changed:
- evals/datasets_/mmlu.py +87 -17
- evals/tasks.py +8 -5
- uv.lock +1 -1
evals/datasets_/mmlu.py
CHANGED
@@ -1,20 +1,28 @@
+from collections import Counter, defaultdict
+import random
+from datasets import get_dataset_config_names, load_dataset
 from joblib.memory import Memory
-from
+from langcodes import Language, standardize_tag
 from rich import print
-
-from collections import defaultdict, Counter
+
 cache = Memory(location=".cache", verbose=0).cache
 
+
 @cache
 def _get_dataset_config_names(dataset):
     return get_dataset_config_names(dataset)
 
+
 @cache
 def _load_dataset(dataset, subset, **kwargs):
     return load_dataset(dataset, subset, **kwargs)
 
-
-
+
+def print_counts(slug, subjects_dev, subjects_test):
+    print(
+        f"{slug:<25} {len(list(set(subjects_test))):>3} test categories, {len(subjects_test):>6} samples, {len(list(set(subjects_dev))):>3} dev categories, {len(subjects_dev):>6} dev samples"
+    )
+
 
 def print_datasets_analysis():
     print("Category counts and sample counts per dataset:")
@@ -24,7 +32,7 @@ def print_datasets_analysis():
     langs1 = _get_dataset_config_names(slug1)
     langs1 = [standardize_tag(a, macro=True) for a in langs1]
 
-    slug2 = "openai/MMMLU"
+    slug2 = "openai/MMMLU"  # does not have dev set! – but: these languages are all also present in Global-MMLU
     ds2 = _load_dataset(slug2, "FR_FR")
     print_counts(slug2, [], ds2["test"]["Subject"])
     langs2 = _get_dataset_config_names(slug2)
@@ -39,16 +47,27 @@ def print_datasets_analysis():
 
     slug4 = "lighteval/okapi_mmlu"
     ds4 = _load_dataset(slug4, "ar", trust_remote_code=True)
-    print_counts(
+    print_counts(
+        slug4,
+        [a.split("/")[0] for a in ds4["dev"]["id"]],
+        [a.split("/")[0] for a in ds4["test"]["id"]],
+    )
     langs4 = _get_dataset_config_names(slug4)
 
-
     slug5 = "Eurolingua/mmlux"
     subsets = _get_dataset_config_names(slug5)
     subjects = set(a.rsplit("_", 1)[0] for a in subsets)
-    rows_test = [
+    rows_test = [
+        _load_dataset(slug5, subset)["test"]["id"]
+        for subset in subsets
+        if "_DA" in subset
+    ]
     rows_test = [a.split("/")[0] for l in rows_test for a in l]
-    rows_dev = [
+    rows_dev = [
+        _load_dataset(slug5, subset)["dev"]["id"]
+        for subset in subsets
+        if "_DA" in subset
+    ]
     rows_dev = [a.split("/")[0] for l in rows_dev for a in l]
     print_counts(slug5, rows_dev, rows_test)
     langs5 = list(set(a.rsplit("_", 1)[1].split("-")[0].lower() for a in subsets))
@@ -70,21 +89,72 @@ def print_datasets_analysis():
     print(len(set(langs)))
 
     print("Datasets per language for languages that are not in Global-MMLU:")
-    print(
-
+    print(
+        sorted(
+            (lang, datasets)
+            for lang, datasets in lang_datasets.items()
+            if slug3 not in datasets
+        )
+    )
+    print(
+        Counter(
+            dataset
+            for ds_list in lang_datasets.values()
+            for dataset in ds_list
+            if slug3 not in ds_list
+        )
+    )
     print(list(set(ds1["test"]["subject"])))
 
+
 # based on this analysis:
 # - we drop the OpenAI dataset, since it does not have a dev set, and since every language that it has is also present in Global-MMLU
 # - we stick to the 5 categories of the AfriMMLU dataset, since this is the most restricted dataset, and these 5 categories are present in all datasets, so this is good for comparability
 
 # AfriMMLU is human-translated, but has only 5 task categories
-# Global-MMLU is
+# Global-MMLU is mixed-translated, specifically those 15 languages are that are also present in Global-MMLU-Lite, which are mostly from MMMLU; otherwise translated using Google Translate
 # Okapi-MMLU is translated using ChatGPT (version unclear)
 # MMLUX is translated using DeepL
-# Therefore, the priority is: AfriMMLU, Global-MMLU, Okapi-MMLU
+# Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
 
-print_datasets_analysis()
+# print_datasets_analysis()
 
-def load_mmlu(language_bcp_47):
-
+def load_mmlu(language_bcp_47, i):
+    categories = sorted(list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"])))
+    category = categories[i % len(categories)]
+    random.seed(i)
+    j = random.randint(0, 100)
+    print(j)
+    tags_afrimmlu = {
+        standardize_tag(a, macro=True): a
+        for a in _get_dataset_config_names("masakhane/afrimmlu")
+    }
+    tags_global_mmlu = {
+        standardize_tag(a, macro=True): a
+        for a in _get_dataset_config_names("CohereForAI/Global-MMLU")
+    }
+    tags_okapi = _get_dataset_config_names("lighteval/okapi_mmlu")
+    tags_mmlux = set(
+        a.rsplit("_", 1)[1].split("-")[0].lower()
+        for a in _get_dataset_config_names("Eurolingua/mmlux")
+    )
+    if language_bcp_47 in tags_afrimmlu:
+        ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
+        return ds["test"].filter(lambda x: x["subject"] == category)[j]
+    elif language_bcp_47 in tags_global_mmlu:
+        ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
+        def add_choices(split):
+            split["choices"] = list(zip([split["option_a"], split["option_b"], split["option_c"], split["option_d"]]))
+            return split
+        ds = ds.map(add_choices)
+        return ds["test"].filter(lambda x: x["subject"] == category)[j]
+    elif language_bcp_47 in tags_okapi:
+        ds = _load_dataset(
+            "lighteval/okapi_mmlu", language_bcp_47, trust_remote_code=True
+        )
+        return ds["test"].filter(lambda x: x["id"] == f"{category}/test/{j}")[0]
+    elif language_bcp_47 in tags_mmlux:
+        # loading this is more complicated, todo
+        return None
+    else:
+        return None
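For orientation, a minimal sketch (not part of the commit) of how the new loader could be exercised. The import path mirrors the `from datasets_.flores import ...` style used in evals/tasks.py, and the sample language tags and the `question` field name are assumptions:

# Sketch only: fetch one MMLU item per language via the new loader.
# The backing dataset follows the priority AfriMMLU > Global-MMLU > Okapi-MMLU;
# MMLUX and unsupported languages currently return None.
from datasets_.mmlu import load_mmlu  # assumed import path

for lang in ["yo", "fr", "de"]:  # hypothetical BCP-47 tags
    item = load_mmlu(lang, 0)  # i selects the category and seeds the sample index
    if item is not None:
        print(lang, item["question"])
    else:
        print(lang, "no MMLU data available")
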
evals/tasks.py
CHANGED
@@ -8,7 +8,7 @@ from datasets_.flores import flores_sentences
 from joblib.memory import Memory
 from languages import languages, script_name
 from models import complete, transcribe
-from datasets import load_dataset
+from datasets import load_dataset, get_dataset_config_names
 
 cache = Memory(location=".cache", verbose=0).cache
 bleu = evaluate.load("bleu")
@@ -186,13 +186,10 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
         }
     ]
 
-
-def _load_dataset(dataset, subset):
-    return load_dataset(dataset, subset)
+
 
 @cache
 async def mmlu_and_evaluate(model, language_bcp_47, nr):
-    data = _load_dataset("CohereForAI/Global-MMLU", language_bcp_47)
     item = data["test"][nr]
     def format_item(item):
         return f"""{item['question']}
@@ -220,12 +217,18 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
             "model": model,
             "bcp_47": language_bcp_47,
             "task": "mmlu",
+            "dataset": ds,
             "metric": "accuracy",
             "score": acc,
             "sentence_nr": nr,
         }
     ]
 
+from asyncio import run
+results = run(mmlu_and_evaluate("gpt-4o-mini", "fr", 0))
+print(results)
+exit()
+
 @cache
 async def transcribe_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
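One note on the new module-level lines in evals/tasks.py (`from asyncio import run` through `exit()`): they execute at import time and terminate the process, so they read as a temporary smoke test. If a quick manual check were kept, guarding it is the usual pattern; a sketch, reusing the commit's own call:

if __name__ == "__main__":
    # run the smoke test only when the file is executed directly
    from asyncio import run

    print(run(mmlu_and_evaluate("gpt-4o-mini", "fr", 0)))
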
uv.lock
CHANGED
@@ -898,7 +898,7 @@ dev = [
     { name = "openai", specifier = ">=1.52.2" },
     { name = "protobuf", specifier = ">=5.28.3" },
     { name = "python-dotenv", specifier = ">=1.0.1" },
-    { name = "rich" },
+    { name = "rich", specifier = ">=14.0.0" },
     { name = "sacrebleu", specifier = ">=2.4.3" },
     { name = "sentencepiece", specifier = ">=0.2.0" },
     { name = "tiktoken", specifier = ">=0.8.0" },