Upload from GitHub Actions: updated workflow settings
- .github/workflows/nightly-evals.yml +2 -0
- evals/README.md +3 -3
- evals/models.py +3 -2
.github/workflows/nightly-evals.yml

@@ -25,6 +25,8 @@ jobs:
         env:
           OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
           HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
+          N_SENTENCES: 20
+          MAX_LANGUAGES: 150
         run: |
           uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
           uv run evals/download_data.py
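The two new variables land in the step environment, so `evals/download_data.py` can read them without extra CLI flags. A minimal sketch of how the script might consume them, assuming it caps the downloaded corpus at `MAX_LANGUAGES` languages with `N_SENTENCES` sentences each; the variable names come from the workflow, while the defaults and the helper below are illustrative:

```python
import os

# Assumed fallbacks for local runs where the workflow env is absent;
# the real script may choose different defaults.
N_SENTENCES = int(os.environ.get("N_SENTENCES", "10"))
MAX_LANGUAGES = int(os.environ.get("MAX_LANGUAGES", "100"))

def cap_corpus(sentences_by_language: dict[str, list[str]]) -> dict[str, list[str]]:
    """Keep at most MAX_LANGUAGES languages and N_SENTENCES sentences per language."""
    kept_languages = sorted(sentences_by_language)[:MAX_LANGUAGES]
    return {lang: sentences_by_language[lang][:N_SENTENCES] for lang in kept_languages}
```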
evals/README.md

@@ -1,10 +1,10 @@
 # Evaluation Framework Documentation
 
-This document outlines the current methodology used for evaluating multilingual language models in this project. We may The framework is designed to be fair, consistent, and robust, providing a standardized way to measure model performance across a diverse set of languages and tasks.
+This document outlines the current methodology used for evaluating multilingual language models in this project. We may update the methodology in the future. The main objective was to have something that is unified and comparable and straightforward to build upon. The framework is designed to be fair, consistent, and robust, providing a standardized way to measure model performance across a diverse set of languages and tasks.
 
-##
+## Current Approach: English Zero-Shot Prompting
 
-The
+The current working base of our evaluation methodology is a **unified English zero-shot prompting strategy**. This means:
 
 1. **Instructions are in English**: All models receive their instructions in clear, standardized English. This removes the quality of prompt translation as a variable, ensuring a fair comparison.
 2. **Content is in the Target Language**: The actual content to be evaluated (e.g., a question for a QA task, a sentence for translation) is always presented in the target language. This directly tests the model's ability to understand instructions in one language and apply them to content in another.
evals/models.py

@@ -27,9 +27,9 @@ important_models = [
     "openai/gpt-4.1-mini", # 1.6$
     "openai/gpt-4.1-nano", # 0.4$
     "openai/gpt-4o-mini", # 0.6$
-
+    "openai/gpt-4o-2024-11-20", # 10$
     "openai/gpt-3.5-turbo-0613", # 2$
-
+    "openai/gpt-3.5-turbo", # 1.5$
     # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
     "mistralai/mistral-small-3.1-24b-instruct", # 0.3$
     "mistralai/mistral-saba", # 0.6$

@@ -60,6 +60,7 @@ blocklist = [
     "google/gemini-2.5-flash-lite-preview-06-17",
     "google/gemini-2.5-pro-preview-06-05",
     "google/gemini-2.5-pro-preview-05-06",
+    "perplexity/sonar-deep-research"
 ]
 
 transcription_models = [
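How these lists are consumed is outside this diff. A plausible sketch, under the assumption that a run starts from the curated `important_models`, extends it with models discovered elsewhere (e.g. an API listing), and drops everything on `blocklist`; with this change, `perplexity/sonar-deep-research` would be filtered out even if a listing still returns it. The function below is hypothetical:

```python
def select_models(important: list[str], discovered: list[str], blocklist: list[str]) -> list[str]:
    """Curated models first, then other discovered models, minus the blocklist."""
    blocked = set(blocklist)
    ordered = list(important) + [m for m in discovered if m not in important]
    return [m for m in ordered if m not in blocked]

# Hypothetical usage:
models = select_models(
    important=["openai/gpt-4o-mini", "mistralai/mistral-saba"],
    discovered=["openai/gpt-4o-mini", "perplexity/sonar-deep-research"],
    blocklist=["perplexity/sonar-deep-research"],
)
assert "perplexity/sonar-deep-research" not in models
```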