Spaces:

fair-forward
/

languagebench

Running

David Pomerenke committed on Apr 27

Commit

260c1a3

1 Parent(s): 3680a5f

Run on 40 languages, additional models

Files changed (6) hide show

evals/datasets_/mmlu.py CHANGED Viewed

@@ -156,6 +156,7 @@ def load_mmlu(language_bcp_47, nr):
         task = ds["test"].filter(lambda x: x["subject"] == category)[i]
         return "CohereForAI/Global-MMLU", examples, task
     elif language_bcp_47 in tags_okapi:
         ds = _load_dataset(
             "lighteval/okapi_mmlu", language_bcp_47, trust_remote_code=True
         )

         task = ds["test"].filter(lambda x: x["subject"] == category)[i]
         return "CohereForAI/Global-MMLU", examples, task
     elif language_bcp_47 in tags_okapi:
+        return None, None, None # FIXME
         ds = _load_dataset(
             "lighteval/okapi_mmlu", language_bcp_47, trust_remote_code=True
         )

evals/main.py CHANGED Viewed

@@ -12,8 +12,8 @@ from tasks import tasks
 # ===== config =====
 n_sentences = 10
-n_languages = 15
-n_models = 20
 # ===== run evaluation and aggregate results =====
@@ -26,6 +26,7 @@ async def evaluate():
         for i in range(n_sentences)
         for lang in languages.iloc[:n_languages].itertuples()
         for model in models["id"].iloc[:n_models]
     ]
     return await tqdm_asyncio.gather(*results, miniters=1)

 # ===== config =====
 n_sentences = 10
+n_languages = 40
+n_models = 25
 # ===== run evaluation and aggregate results =====
         for i in range(n_sentences)
         for lang in languages.iloc[:n_languages].itertuples()
         for model in models["id"].iloc[:n_models]
+        if lang.in_benchmark # TODO
     ]
     return await tqdm_asyncio.gather(*results, miniters=1)

evals/models.py CHANGED Viewed

@@ -20,22 +20,26 @@ models = [
     "meta-llama/llama-3.3-70b-instruct",  # 0.3$
     "meta-llama/llama-3.1-70b-instruct",  # 0.3$
     "meta-llama/llama-3-70b-instruct",  # 0.4$
-    # "meta-llama/llama-2-70b-chat", # 0.9$; not enough context
     "openai/gpt-4.1-nano",  # 0.4$
     "openai/gpt-4o-mini",  # 0.6$
-    # "openai/gpt-3.5-turbo-0613",  # 2$
-    # "openai/gpt-3.5-turbo",  # 1.5$
     # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
     "mistralai/mistral-small-3.1-24b-instruct",  # 0.3$
-    # "mistralai/mistral-saba", # 0.6$
-    # "mistralai/mistral-nemo", # 0.08$
     "google/gemini-2.5-flash-preview",  # 0.6$
-    # "google/gemini-2.0-flash-lite-001",  # 0.3$
     "google/gemma-3-27b-it",  # 0.2$
     # "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
-    "qwen/qwq-32b",  # 0.2$
     "deepseek/deepseek-chat-v3-0324",  # 1.1$
-    # "microsoft/phi-4",  # 0.07$; only 16k tokens context
     "microsoft/phi-4-multimodal-instruct",  # 0.1$
     "amazon/nova-micro-v1",  # 0.09$
 ]
@@ -152,7 +156,7 @@ def get_hf_metadata(row):
         return empty
     try:
         info = api.model_info(id)
-        license = info.card_data.license.replace("-", " ").replace("mit", "MIT").title()
         return {
             "hf_id": info.id,
             "creation_date": info.created_at,

     "meta-llama/llama-3.3-70b-instruct",  # 0.3$
     "meta-llama/llama-3.1-70b-instruct",  # 0.3$
     "meta-llama/llama-3-70b-instruct",  # 0.4$
+    # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
+    "openai/gpt-4.1-mini",  # 1.6$
     "openai/gpt-4.1-nano",  # 0.4$
     "openai/gpt-4o-mini",  # 0.6$
+    "openai/gpt-3.5-turbo-0613",  # 2$
+    "openai/gpt-3.5-turbo",  # 1.5$
     # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
     "mistralai/mistral-small-3.1-24b-instruct",  # 0.3$
+    "mistralai/mistral-saba", # 0.6$
+    "mistralai/mistral-nemo", # 0.08$
     "google/gemini-2.5-flash-preview",  # 0.6$
+    "google/gemini-2.0-flash-lite-001",  # 0.3$
     "google/gemma-3-27b-it",  # 0.2$
     # "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
+    # "qwen/qwq-32b",  # 0.2$
+    # "qwen/qwen-2.5-72b-instruct",  # 0.39$
+    # "qwen/qwen-2-72b-instruct",  # 0.9$
     "deepseek/deepseek-chat-v3-0324",  # 1.1$
+    "deepseek/deepseek-chat", # 0.89$
+    "microsoft/phi-4",  # 0.07$
     "microsoft/phi-4-multimodal-instruct",  # 0.1$
     "amazon/nova-micro-v1",  # 0.09$
 ]
         return empty
     try:
         info = api.model_info(id)
+        license = (info.card_data.license or "").replace("-", " ").replace("mit", "MIT").title()
         return {
             "hf_id": info.id,
             "creation_date": info.created_at,

evals/tasks.py CHANGED Viewed

@@ -221,13 +221,19 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
             {"role": "assistant", "content": example["answer"]},
         ]
     messages += [{"role": "user", "content": format_item(task)}]
-    reply = await complete(
-        model=model,
-        messages=messages,
-        temperature=0,
-        max_tokens=1,
-    )
-    acc = int(reply.choices[0].message.content[:1].strip() == task["answer"])
     return [
         {
             "model": model,

             {"role": "assistant", "content": example["answer"]},
         ]
     messages += [{"role": "user", "content": format_item(task)}]
+    try:
+        reply = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        acc = int(reply.choices[0].message.content[:1].strip() == task["answer"])
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,

frontend/src/components/SpeakerPlot.js CHANGED Viewed

@@ -73,9 +73,9 @@ const SpeakerPlot = ({ data }) => {
           textStrokeOpacity: 0,
           textFillOpacity: 0
         }),
-        Plot.tip(['The 41 most spoken languages cover 80% of all speakers.'], {
-          x: 41,
-          y: languages[40].cumSpeakers / 1e6
         })
       ]
     })

           textStrokeOpacity: 0,
           textFillOpacity: 0
         }),
+        Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
+          x: 40,
+          y: languages[39].cumSpeakers / 1e6
         })
       ]
     })

results.json CHANGED Viewed

The diff for this file is too large to render. See raw diff