davidpomerenke committed
Commit e51c770 · verified · 1 Parent(s): bbb82e8

Upload from GitHub Actions: updated workflow settings

.github/workflows/nightly-evals.yml CHANGED
@@ -25,6 +25,8 @@ jobs:
       env:
         OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
         HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
+        N_SENTENCES: 20
+        MAX_LANGUAGES: 150
       run: |
         uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
         uv run evals/download_data.py
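
The two new variables apparently cap the size of the nightly data pull. A minimal sketch of how evals/download_data.py might consume them; the variable names come from the diff, but the parsing logic and defaults below are assumptions:

```python
import os

# Assumed consumption of the new workflow variables; the actual logic in
# evals/download_data.py is not shown in this commit.
N_SENTENCES = int(os.environ.get("N_SENTENCES", "20"))       # sentences per language
MAX_LANGUAGES = int(os.environ.get("MAX_LANGUAGES", "150"))  # number of languages

print(f"Downloading up to {N_SENTENCES} sentences for {MAX_LANGUAGES} languages")
```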
evals/README.md CHANGED
@@ -1,10 +1,10 @@
 # Evaluation Framework Documentation
 
-This document outlines the current methodology used for evaluating multilingual language models in this project. The framework is designed to be fair, consistent, and robust, providing a standardized way to measure model performance across a diverse set of languages and tasks.
+This document outlines the current methodology used for evaluating multilingual language models in this project. We may update the methodology in the future. The main objective was to have something that is unified and comparable and straightforward to build upon. The framework is designed to be fair, consistent, and robust, providing a standardized way to measure model performance across a diverse set of languages and tasks.
 
-## Core Philosophy: English Zero-Shot Prompting
+## Current Approach: English Zero-Shot Prompting
 
-The core of our evaluation methodology is a **unified English zero-shot prompting strategy**. This means:
+The current working base of our evaluation methodology is a **unified English zero-shot prompting strategy**. This means:
 
 1. **Instructions are in English**: All models receive their instructions in clear, standardized English. This removes the quality of prompt translation as a variable, ensuring a fair comparison.
 2. **Content is in the Target Language**: The actual content to be evaluated (e.g., a question for a QA task, a sentence for translation) is always presented in the target language. This directly tests the model's ability to understand instructions in one language and apply them to content in another.
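
As an illustration of the strategy the README describes, an English instruction wraps target-language content. The template below is hypothetical; the actual prompt wording used by the evals is not part of this commit:

```python
# Hypothetical prompt construction for the English zero-shot strategy.
def build_prompt(instruction_en: str, content_in_target_language: str) -> str:
    # The instruction stays in English; only the content is in the target language.
    return f"{instruction_en}\n\n{content_in_target_language}"

prompt = build_prompt(
    "Translate the following sentence into English:",
    "Der schnelle braune Fuchs springt über den faulen Hund.",
)
```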
evals/models.py CHANGED
@@ -27,9 +27,9 @@ important_models = [
     "openai/gpt-4.1-mini", # 1.6$
     "openai/gpt-4.1-nano", # 0.4$
     "openai/gpt-4o-mini", # 0.6$
-    # "openai/gpt-4o-2024-11-20", # 10$
+    "openai/gpt-4o-2024-11-20", # 10$
     "openai/gpt-3.5-turbo-0613", # 2$
-    # "openai/gpt-3.5-turbo", # 1.5$
+    "openai/gpt-3.5-turbo", # 1.5$
     # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
     "mistralai/mistral-small-3.1-24b-instruct", # 0.3$
     "mistralai/mistral-saba", # 0.6$
@@ -60,6 +60,7 @@ blocklist = [
     "google/gemini-2.5-flash-lite-preview-06-17",
     "google/gemini-2.5-pro-preview-06-05",
     "google/gemini-2.5-pro-preview-05-06",
+    "perplexity/sonar-deep-research"
 ]
 
 transcription_models = [
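
For context on how these lists typically interact: a minimal sketch of applying blocklist when selecting models to evaluate. The select_models helper is an assumption for illustration, not code from evals/models.py:

```python
# Hypothetical selection step; evals/models.py defines important_models and
# blocklist, but the exact filtering logic is not shown in this commit.
blocklist = ["perplexity/sonar-deep-research"]

def select_models(candidates: list[str]) -> list[str]:
    # Drop blocklisted models while preserving the original order.
    return [m for m in candidates if m not in blocklist]

print(select_models(["openai/gpt-4o-mini", "perplexity/sonar-deep-research"]))
# -> ['openai/gpt-4o-mini']
```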