Upload from GitHub Actions: Merge pull request #10 from datenlabor-bmz/jn-dev
Files changed:
- .github/workflows/nightly-evals.yml (+24, -2)
- evals/main.py (+119, -12)

.github/workflows/nightly-evals.yml (CHANGED)

@@ -8,6 +8,7 @@ on:
 jobs:
   run-evals:
     runs-on: ubuntu-latest
+    timeout-minutes: 1440  # 24 hours timeout
     steps:
       - uses: actions/checkout@v3
 
@@ -21,7 +22,7 @@ jobs:
           curl -LsSf https://astral.sh/uv/install.sh | sh
           uv sync --frozen --extra dev
 
-      - name: Run evaluations
+      - name: Run evaluations with checkpointing
         env:
           OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
           HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
@@ -30,7 +31,28 @@ jobs:
         run: |
          uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
          uv run evals/download_data.py
-          uv run evals/main.py
+
+          # Run evaluations with periodic checkpointing
+          uv run python -c "
+          import time
+          import subprocess
+          import json
+          import os
+
+          # Check if we have existing results to resume from
+          if os.path.exists('results.json'):
+              print('Found existing results.json, will resume from checkpoint')
+
+          # Run the main evaluation
+          try:
+              subprocess.run(['uv', 'run', 'evals/main.py'], check=True)
+          except subprocess.CalledProcessError as e:
+              print(f'Evaluation failed: {e}')
+              # Save current state even if failed
+              if os.path.exists('results.json'):
+                  print('Saving checkpoint before exit...')
+              exit(1)
+          "
 
       - name: Commit changes
         env:
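
Note: the inline python -c wrapper added above only orchestrates the run. A standalone equivalent is sketched below for local testing; it is illustrative only and not a file in this PR (the name local_run_evals.py is made up), and it assumes uv and the repository checkout are available. It checks for a partially written results.json, invokes evals/main.py the same way the workflow step does, and returns a non-zero exit code so a wrapping step still fails when the evaluation fails.

# local_run_evals.py -- illustrative sketch only, not part of this PR
import os
import subprocess
import sys


def main() -> int:
    # An existing results.json means an earlier run left partial results behind
    if os.path.exists("results.json"):
        print("Found existing results.json, will resume from checkpoint")
    try:
        # Same entry point the workflow step invokes
        subprocess.run(["uv", "run", "evals/main.py"], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Evaluation failed: {e}")
        # Partial results (if any) stay on disk so the next run can resume
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())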

evals/main.py (CHANGED)

@@ -11,6 +11,45 @@ import json
 
 results = pd.DataFrame()
 
+def save_checkpoint(results_df, models_df, languages_df, batch_num, total_batches):
+    """Save current progress as checkpoint"""
+    try:
+        args = dict(orient="records", indent=2, force_ascii=False)
+
+        # Save current results
+        if len(results_df) > 0:
+            results_df.to_json("results.json", **args)
+            print(f"💾 Checkpoint saved: {len(results_df)} results (batch {batch_num}/{total_batches})")
+
+        # Save model and language info
+        models_df.to_json("models.json", **args)
+        languages_df.to_json("languages.json", **args)
+
+        # Save checkpoint metadata
+        checkpoint_info = {
+            "last_batch": batch_num,
+            "total_batches": total_batches,
+            "timestamp": datetime.now().isoformat(),
+            "results_count": len(results_df)
+        }
+        with open("checkpoint.json", "w") as f:
+            json.dump(checkpoint_info, f, indent=2)
+
+    except Exception as e:
+        print(f"⚠️ Failed to save checkpoint: {e}")
+
+def load_checkpoint():
+    """Load previous checkpoint if available"""
+    try:
+        if os.path.exists("checkpoint.json"):
+            with open("checkpoint.json", "r") as f:
+                checkpoint = json.load(f)
+            print(f"Found checkpoint from batch {checkpoint['last_batch']}/{checkpoint['total_batches']}")
+            return checkpoint
+    except Exception as e:
+        print(f"⚠️ Failed to load checkpoint: {e}")
+    return None
+
 async def evaluate():
     # FIXME we should not need this for-loop, but it helps
     n_sentences = int(os.environ.get("N_SENTENCES", 15))  # Default 1 for quick testing
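
Note on the two helpers above: save_checkpoint() and load_checkpoint() communicate through a small checkpoint.json metadata file. A minimal, self-contained round-trip of that file looks roughly like this (the numeric values are made up; the field names last_batch, total_batches, timestamp and results_count are the ones written in the diff):

# Sketch of the checkpoint.json round-trip (illustrative values only)
import json
import os
from datetime import datetime

checkpoint_info = {
    "last_batch": 3,                       # made-up value
    "total_batches": 12,                   # made-up value
    "timestamp": datetime.now().isoformat(),
    "results_count": 600,                  # made-up value
}
with open("checkpoint.json", "w") as f:
    json.dump(checkpoint_info, f, indent=2)

# A later run reads the metadata back and resumes after the last finished batch
if os.path.exists("checkpoint.json"):
    with open("checkpoint.json") as f:
        checkpoint = json.load(f)
    start_batch = checkpoint["last_batch"]
    print(f"Resuming from batch {start_batch} of {checkpoint['total_batches']}")

Because evaluate() deletes checkpoint.json only after a fully successful run (see the end of this diff), a timed-out or failed job leaves the file in place for the next nightly run to pick up.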
@@ -29,13 +68,30 @@ async def evaluate():
     top_languages = languages.head(max_languages)  # Top N by population
     print(f"Evaluating top {len(top_languages)} languages by speakers (max: {max_languages})")
 
+    # Load checkpoint if available
+    checkpoint = load_checkpoint()
+    start_batch = 0
+    if checkpoint:
+        start_batch = checkpoint['last_batch']
+        print(f"Resuming from batch {start_batch}")
+
     # For testing, just use all available languages up to max_languages
     for n_languages in [min(max_languages, len(top_languages))]:
         print(f"running evaluations for {n_languages} languages")
-
-
+
+        # Load existing results
+        try:
+            old_results = pd.read_json("results.json")
+            if old_results.empty:
+                old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
+        except FileNotFoundError:
             old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-
+
+        try:
+            old_models = pd.read_json("models.json")
+        except FileNotFoundError:
+            old_models = pd.DataFrame()
+
         # get all combinations of model, language and task
         combis = [
             (model, lang.bcp_47, task_name)
@@ -60,9 +116,14 @@ async def evaluate():
         batch_size = 200  # Process 200 tasks at a time (optimized for GitHub Actions)
         all_results = []
 
-        for i in range(0, len(all_tasks), batch_size):
+        # Calculate total batches for progress tracking
+        total_batches = (len(all_tasks) + batch_size - 1) // batch_size
+
+        for i in range(start_batch * batch_size, len(all_tasks), batch_size):
             batch = all_tasks[i:i+batch_size]
-
+            current_batch = i // batch_size + 1
+
+            print(f"📦 Processing batch {current_batch}/{total_batches} ({len(batch)} tasks)")
 
             # Show what's being evaluated in this batch
             batch_summary = {}
@@ -86,12 +147,57 @@ async def evaluate():
             for task_data in batch:
                 task_func, model, bcp_47, sentence_nr = task_data
                 batch_coroutines.append(task_func(model, bcp_47, sentence_nr))
-            batch_results = await asyncio.gather(*batch_coroutines, return_exceptions=True)
-            all_results.extend(batch_results)
+
+            try:
+                batch_results = await asyncio.gather(*batch_coroutines, return_exceptions=True)
+                all_results.extend(batch_results)
+
+                # Save checkpoint after each batch
+                valid_results = []
+                exception_count = 0
+                for r in batch_results:
+                    if isinstance(r, Exception):
+                        exception_count += 1
+                        continue
+                    if isinstance(r, list):
+                        valid_results.extend(r)
+                    else:
+                        valid_results.append(r)
+
+                if valid_results:
+                    # Aggregate results
+                    batch_df = pd.DataFrame(valid_results)
+                    if len(batch_df) > 0:
+                        batch_df = (
+                            batch_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
+                            .agg({"score": "mean"})
+                            .reset_index()
+                        )
+                        # Merge with existing results
+                        all_results_df = pd.concat([old_results, batch_df])
+                        all_results_df = all_results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
+                        all_results_df = all_results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
+
+                        # Save checkpoint
+                        save_checkpoint(all_results_df, models_df, languages_df, current_batch, total_batches)
+
+                        # Update old_results for next batch
+                        old_results = all_results_df
+
+                print(f"✅ Batch {current_batch} completed: {len(valid_results)} valid results, {exception_count} errors")
+
+            except Exception as e:
+                print(f"❌ Batch {current_batch} failed: {e}")
+                # Save checkpoint even on failure
+                if len(all_results) > 0:
+                    results_df = pd.DataFrame(all_results)
+                    save_checkpoint(results_df, models_df, languages_df, current_batch, total_batches)
+                continue
 
             # Reduced delay between batches (optimized for GitHub Actions)
             await asyncio.sleep(0.5)
 
+        # Final aggregation and save
         results = all_results
         # Filter out exceptions and flatten results
         valid_results = []
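
Note on the per-batch merge above: each batch's scores are averaged per (model, bcp_47, task, metric, origin) key and then concatenated with the previously saved rows, with drop_duplicates on those key columns so that a re-processed batch does not add duplicate rows. A toy pandas illustration of that merge (the rows and scores below are made up):

# Toy illustration of the concat + drop_duplicates merge used after each batch
import pandas as pd

old_results = pd.DataFrame([
    {"model": "m1", "bcp_47": "en", "task": "translation", "metric": "bleu", "origin": "human", "score": 0.42},
])
batch_df = pd.DataFrame([
    # Same key as the saved row (e.g. a re-run batch) plus one genuinely new row
    {"model": "m1", "bcp_47": "en", "task": "translation", "metric": "bleu", "origin": "human", "score": 0.40},
    {"model": "m1", "bcp_47": "sw", "task": "translation", "metric": "bleu", "origin": "human", "score": 0.31},
])

merged = pd.concat([old_results, batch_df])
merged = merged.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
merged = merged.sort_values(by=["model", "bcp_47", "task", "metric"])
print(merged)  # keeps the already-saved "en" row (first occurrence wins) and adds the new "sw" row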
@@ -108,7 +214,7 @@ async def evaluate():
         print(f"⚠️ Encountered {exception_count} API errors (model unavailable/rate limits)")
         print(f"✅ Successfully processed {len(valid_results)} evaluations")
 
-        # Save
+        # Save final results
         if valid_results:
             results = valid_results
             args = dict(orient="records", indent=2, force_ascii=False)
@@ -124,6 +230,7 @@ async def evaluate():
             # Merge with old results
             old_results = pd.read_json("results.json")
             results_df = pd.concat([old_results, results_df])
+            results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
             results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
             results_df.to_json("results.json", **args)
             print(f"💾 Saved {len(results_df)} aggregated results to results.json")
@@ -153,10 +260,10 @@ async def evaluate():
     print(f"✅ Full evaluation completed in {elapsed_str}")
     print(f"Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
 
-    #
-
-
-
+    # Clean up checkpoint file on successful completion
+    if os.path.exists("checkpoint.json"):
+        os.remove("checkpoint.json")
+        print("🧹 Cleaned up checkpoint file")
 
     return results
 
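
Taken together, the changes make the nightly job restartable: batches are indexed so that a saved last_batch maps directly onto positions in the task list. A self-contained sketch of that arithmetic (made-up task list and batch size, no network calls):

# Toy sketch of the resumable batch arithmetic used in the modified evaluate() loop
all_tasks = list(range(10))   # stand-ins for (model, language, task) work items
batch_size = 4
start_batch = 1               # e.g. read from checkpoint.json via load_checkpoint()

total_batches = (len(all_tasks) + batch_size - 1) // batch_size  # ceiling division -> 3

for i in range(start_batch * batch_size, len(all_tasks), batch_size):
    batch = all_tasks[i:i + batch_size]
    current_batch = i // batch_size + 1
    print(f"Processing batch {current_batch}/{total_batches} ({len(batch)} tasks)")
# Prints batches 2/3 and 3/3; batch 1 is skipped because the checkpoint already covers it.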