Spaces:
Running
Running
David Pomerenke
committed on
Commit
·
260c1a3
1
Parent(s):
3680a5f
Run on 40 languages, additional models
Browse files
- evals/datasets_/mmlu.py +1 -0
- evals/main.py +3 -2
- evals/models.py +13 -9
- evals/tasks.py +13 -7
- frontend/src/components/SpeakerPlot.js +3 -3
- results.json +0 -0
evals/datasets_/mmlu.py
CHANGED
|
@@ -156,6 +156,7 @@ def load_mmlu(language_bcp_47, nr):
|
|
| 156 |
task = ds["test"].filter(lambda x: x["subject"] == category)[i]
|
| 157 |
return "CohereForAI/Global-MMLU", examples, task
|
| 158 |
elif language_bcp_47 in tags_okapi:
|
|
|
|
| 159 |
ds = _load_dataset(
|
| 160 |
"lighteval/okapi_mmlu", language_bcp_47, trust_remote_code=True
|
| 161 |
)
|
|
|
|
| 156 |
task = ds["test"].filter(lambda x: x["subject"] == category)[i]
|
| 157 |
return "CohereForAI/Global-MMLU", examples, task
|
| 158 |
elif language_bcp_47 in tags_okapi:
|
| 159 |
+
return None, None, None # FIXME
|
| 160 |
ds = _load_dataset(
|
| 161 |
"lighteval/okapi_mmlu", language_bcp_47, trust_remote_code=True
|
| 162 |
)
|
evals/main.py
CHANGED
|
@@ -12,8 +12,8 @@ from tasks import tasks
|
|
| 12 |
# ===== config =====
|
| 13 |
|
| 14 |
n_sentences = 10
|
| 15 |
-
n_languages =
|
| 16 |
-
n_models =
|
| 17 |
|
| 18 |
# ===== run evaluation and aggregate results =====
|
| 19 |
|
|
@@ -26,6 +26,7 @@ async def evaluate():
|
|
| 26 |
for i in range(n_sentences)
|
| 27 |
for lang in languages.iloc[:n_languages].itertuples()
|
| 28 |
for model in models["id"].iloc[:n_models]
|
|
|
|
| 29 |
]
|
| 30 |
return await tqdm_asyncio.gather(*results, miniters=1)
|
| 31 |
|
|
|
|
| 12 |
# ===== config =====
|
| 13 |
|
| 14 |
n_sentences = 10
|
| 15 |
+
n_languages = 40
|
| 16 |
+
n_models = 25
|
| 17 |
|
| 18 |
# ===== run evaluation and aggregate results =====
|
| 19 |
|
|
|
|
| 26 |
for i in range(n_sentences)
|
| 27 |
for lang in languages.iloc[:n_languages].itertuples()
|
| 28 |
for model in models["id"].iloc[:n_models]
|
| 29 |
+
if lang.in_benchmark # TODO
|
| 30 |
]
|
| 31 |
return await tqdm_asyncio.gather(*results, miniters=1)
|
| 32 |
|
evals/models.py
CHANGED
|
@@ -20,22 +20,26 @@ models = [
|
|
| 20 |
"meta-llama/llama-3.3-70b-instruct", # 0.3$
|
| 21 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$
|
| 22 |
"meta-llama/llama-3-70b-instruct", # 0.4$
|
| 23 |
-
# "meta-llama/llama-2-70b-chat", # 0.9$; not
|
|
|
|
| 24 |
"openai/gpt-4.1-nano", # 0.4$
|
| 25 |
"openai/gpt-4o-mini", # 0.6$
|
| 26 |
-
|
| 27 |
-
|
| 28 |
# "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
|
| 29 |
"mistralai/mistral-small-3.1-24b-instruct", # 0.3$
|
| 30 |
-
|
| 31 |
-
|
| 32 |
"google/gemini-2.5-flash-preview", # 0.6$
|
| 33 |
-
|
| 34 |
"google/gemma-3-27b-it", # 0.2$
|
| 35 |
# "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
|
| 36 |
-
"qwen/qwq-32b", # 0.2$
|
|
|
|
|
|
|
| 37 |
"deepseek/deepseek-chat-v3-0324", # 1.1$
|
| 38 |
-
|
|
|
|
| 39 |
"microsoft/phi-4-multimodal-instruct", # 0.1$
|
| 40 |
"amazon/nova-micro-v1", # 0.09$
|
| 41 |
]
|
|
@@ -152,7 +156,7 @@ def get_hf_metadata(row):
|
|
| 152 |
return empty
|
| 153 |
try:
|
| 154 |
info = api.model_info(id)
|
| 155 |
-
license = info.card_data.license.replace("-", " ").replace("mit", "MIT").title()
|
| 156 |
return {
|
| 157 |
"hf_id": info.id,
|
| 158 |
"creation_date": info.created_at,
|
|
|
|
| 20 |
"meta-llama/llama-3.3-70b-instruct", # 0.3$
|
| 21 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$
|
| 22 |
"meta-llama/llama-3-70b-instruct", # 0.4$
|
| 23 |
+
# "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
|
| 24 |
+
"openai/gpt-4.1-mini", # 1.6$
|
| 25 |
"openai/gpt-4.1-nano", # 0.4$
|
| 26 |
"openai/gpt-4o-mini", # 0.6$
|
| 27 |
+
"openai/gpt-3.5-turbo-0613", # 2$
|
| 28 |
+
"openai/gpt-3.5-turbo", # 1.5$
|
| 29 |
# "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
|
| 30 |
"mistralai/mistral-small-3.1-24b-instruct", # 0.3$
|
| 31 |
+
"mistralai/mistral-saba", # 0.6$
|
| 32 |
+
"mistralai/mistral-nemo", # 0.08$
|
| 33 |
"google/gemini-2.5-flash-preview", # 0.6$
|
| 34 |
+
"google/gemini-2.0-flash-lite-001", # 0.3$
|
| 35 |
"google/gemma-3-27b-it", # 0.2$
|
| 36 |
# "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
|
| 37 |
+
# "qwen/qwq-32b", # 0.2$
|
| 38 |
+
# "qwen/qwen-2.5-72b-instruct", # 0.39$
|
| 39 |
+
# "qwen/qwen-2-72b-instruct", # 0.9$
|
| 40 |
"deepseek/deepseek-chat-v3-0324", # 1.1$
|
| 41 |
+
"deepseek/deepseek-chat", # 0.89$
|
| 42 |
+
"microsoft/phi-4", # 0.07$
|
| 43 |
"microsoft/phi-4-multimodal-instruct", # 0.1$
|
| 44 |
"amazon/nova-micro-v1", # 0.09$
|
| 45 |
]
|
|
|
|
| 156 |
return empty
|
| 157 |
try:
|
| 158 |
info = api.model_info(id)
|
| 159 |
+
license = (info.card_data.license or "").replace("-", " ").replace("mit", "MIT").title()
|
| 160 |
return {
|
| 161 |
"hf_id": info.id,
|
| 162 |
"creation_date": info.created_at,
|
evals/tasks.py
CHANGED
|
@@ -221,13 +221,19 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
|
|
| 221 |
{"role": "assistant", "content": example["answer"]},
|
| 222 |
]
|
| 223 |
messages += [{"role": "user", "content": format_item(task)}]
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
return [
|
| 232 |
{
|
| 233 |
"model": model,
|
|
|
|
| 221 |
{"role": "assistant", "content": example["answer"]},
|
| 222 |
]
|
| 223 |
messages += [{"role": "user", "content": format_item(task)}]
|
| 224 |
+
try:
|
| 225 |
+
reply = await complete(
|
| 226 |
+
model=model,
|
| 227 |
+
messages=messages,
|
| 228 |
+
temperature=0,
|
| 229 |
+
max_tokens=1,
|
| 230 |
+
)
|
| 231 |
+
acc = int(reply.choices[0].message.content[:1].strip() == task["answer"])
|
| 232 |
+
except Exception as e:
|
| 233 |
+
if "ResponsibleAIPolicyViolation" in str(e):
|
| 234 |
+
acc = 0
|
| 235 |
+
else:
|
| 236 |
+
raise e
|
| 237 |
return [
|
| 238 |
{
|
| 239 |
"model": model,
|
frontend/src/components/SpeakerPlot.js
CHANGED
|
@@ -73,9 +73,9 @@ const SpeakerPlot = ({ data }) => {
|
|
| 73 |
textStrokeOpacity: 0,
|
| 74 |
textFillOpacity: 0
|
| 75 |
}),
|
| 76 |
-
Plot.tip(['The
|
| 77 |
-
x:
|
| 78 |
-
y: languages[
|
| 79 |
})
|
| 80 |
]
|
| 81 |
})
|
|
|
|
| 73 |
textStrokeOpacity: 0,
|
| 74 |
textFillOpacity: 0
|
| 75 |
}),
|
| 76 |
+
Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
|
| 77 |
+
x: 40,
|
| 78 |
+
y: languages[39].cumSpeakers / 1e6
|
| 79 |
})
|
| 80 |
]
|
| 81 |
})
|
results.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|