davidpomerenke committed
Commit e51c770 · verified · 1 Parent(s): bbb82e8

Upload from GitHub Actions: updated workflow settings

.github/workflows/nightly-evals.yml CHANGED
@@ -25,6 +25,8 @@ jobs:
       env:
         OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
         HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
+        N_SENTENCES: 20
+        MAX_LANGUAGES: 150
       run: |
         uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
         uv run evals/download_data.py
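
The two new variables apparently cap the size of the nightly data pull. A minimal sketch of how evals/download_data.py might consume them; the variable names come from the diff, but the parsing logic and defaults below are assumptions:

```python
import os

# Assumed consumption of the new workflow variables; the actual logic in
# evals/download_data.py is not shown in this commit.
N_SENTENCES = int(os.environ.get("N_SENTENCES", "20"))       # sentences per language
MAX_LANGUAGES = int(os.environ.get("MAX_LANGUAGES", "150"))  # number of languages

print(f"Downloading up to {N_SENTENCES} sentences for {MAX_LANGUAGES} languages")
```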
evals/README.md CHANGED
@@ -1,10 +1,10 @@
 # Evaluation Framework Documentation
 
-This document outlines the current methodology used for evaluating multilingual language models in this project. The framework is designed to be fair, consistent, and robust, providing a standardized way to measure model performance across a diverse set of languages and tasks.
+This document outlines the current methodology used for evaluating multilingual language models in this project. We may update the methodology in the future. The main objective was to have something that is unified and comparable and straightforward to build upon. The framework is designed to be fair, consistent, and robust, providing a standardized way to measure model performance across a diverse set of languages and tasks.
 
-## Core Philosophy: English Zero-Shot Prompting
+## Current Approach: English Zero-Shot Prompting
 
-The core of our evaluation methodology is a **unified English zero-shot prompting strategy**. This means:
+The current working base of our evaluation methodology is a **unified English zero-shot prompting strategy**. This means:
 
 1. **Instructions are in English**: All models receive their instructions in clear, standardized English. This removes the quality of prompt translation as a variable, ensuring a fair comparison.
 2. **Content is in the Target Language**: The actual content to be evaluated (e.g., a question for a QA task, a sentence for translation) is always presented in the target language. This directly tests the model's ability to understand instructions in one language and apply them to content in another.
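
As an illustration of the strategy the README describes, an English instruction wraps target-language content. The template below is hypothetical; the actual prompt wording used by the evals is not part of this commit:

```python
# Hypothetical prompt construction for the English zero-shot strategy.
def build_prompt(instruction_en: str, content_in_target_language: str) -> str:
    # The instruction stays in English; only the content is in the target language.
    return f"{instruction_en}\n\n{content_in_target_language}"

prompt = build_prompt(
    "Translate the following sentence into English:",
    "Der schnelle braune Fuchs springt über den faulen Hund.",
)
```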
evals/models.py CHANGED
@@ -27,9 +27,9 @@ important_models = [
     "openai/gpt-4.1-mini", # 1.6$
     "openai/gpt-4.1-nano", # 0.4$
     "openai/gpt-4o-mini", # 0.6$
-    # "openai/gpt-4o-2024-11-20", # 10$
+    "openai/gpt-4o-2024-11-20", # 10$
     "openai/gpt-3.5-turbo-0613", # 2$
-    # "openai/gpt-3.5-turbo", # 1.5$
+    "openai/gpt-3.5-turbo", # 1.5$
     # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
     "mistralai/mistral-small-3.1-24b-instruct", # 0.3$
     "mistralai/mistral-saba", # 0.6$
@@ -60,6 +60,7 @@ blocklist = [
     "google/gemini-2.5-flash-lite-preview-06-17",
     "google/gemini-2.5-pro-preview-06-05",
     "google/gemini-2.5-pro-preview-05-06",
+    "perplexity/sonar-deep-research"
 ]
 
 transcription_models = [
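
For context on how these lists typically interact: a minimal sketch of applying blocklist when selecting models to evaluate. The select_models helper is an assumption for illustration, not code from evals/models.py:

```python
# Hypothetical selection step; evals/models.py defines important_models and
# blocklist, but the exact filtering logic is not shown in this commit.
blocklist = ["perplexity/sonar-deep-research"]

def select_models(candidates: list[str]) -> list[str]:
    # Drop blocklisted models while preserving the original order.
    return [m for m in candidates if m not in blocklist]

print(select_models(["openai/gpt-4o-mini", "perplexity/sonar-deep-research"]))
# -> ['openai/gpt-4o-mini']
```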