davidpomerenke committed
Commit c2eeeac · verified · 1 Parent(s): 02f927b

Upload from GitHub Actions: Merge pull request #10 from datenlabor-bmz/jn-dev

Files changed (2)
  1. .github/workflows/nightly-evals.yml +24 -2
  2. evals/main.py +119 -12
.github/workflows/nightly-evals.yml CHANGED
@@ -8,6 +8,7 @@ on:
 jobs:
   run-evals:
     runs-on: ubuntu-latest
+    timeout-minutes: 1440  # 24 hours timeout
     steps:
       - uses: actions/checkout@v3
 
@@ -21,7 +22,7 @@ jobs:
           curl -LsSf https://astral.sh/uv/install.sh | sh
           uv sync --frozen --extra dev
 
-      - name: Run evaluations
+      - name: Run evaluations with checkpointing
        env:
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
@@ -30,7 +31,28 @@ jobs:
        run: |
          uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
          uv run evals/download_data.py
-         uv run evals/main.py
+
+         # Run evaluations with periodic checkpointing
+         uv run python -c "
+         import time
+         import subprocess
+         import json
+         import os
+
+         # Check if we have existing results to resume from
+         if os.path.exists('results.json'):
+             print('Found existing results.json, will resume from checkpoint')
+
+         # Run the main evaluation
+         try:
+             subprocess.run(['uv', 'run', 'evals/main.py'], check=True)
+         except subprocess.CalledProcessError as e:
+             print(f'Evaluation failed: {e}')
+             # Save current state even if failed
+             if os.path.exists('results.json'):
+                 print('Saving checkpoint before exit...')
+             exit(1)
+         "
 
       - name: Commit changes
        env:
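
Note: the inline python -c wrapper above only detects whether results.json is already present; the actual checkpoint writing happens in evals/main.py (diff below). For local debugging, roughly the same wrapper can live in a standalone script. A minimal sketch, assuming the repo layout from this PR (the file name run_evals_local.py is made up, not part of the change):

# run_evals_local.py -- hypothetical helper, not part of this PR
import os
import subprocess
import sys

# Mirror the workflow's pre-flight check
if os.path.exists("results.json"):
    print("Found existing results.json, will resume from checkpoint")

try:
    # Same entry point the workflow invokes
    subprocess.run(["uv", "run", "evals/main.py"], check=True)
except subprocess.CalledProcessError as e:
    print(f"Evaluation failed: {e}")
    # results.json / checkpoint.json written by main.py stay on disk,
    # so the next run can resume from them
    sys.exit(1)
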
evals/main.py CHANGED
@@ -11,6 +11,45 @@ import json
 
 results = pd.DataFrame()
 
+def save_checkpoint(results_df, models_df, languages_df, batch_num, total_batches):
+    """Save current progress as checkpoint"""
+    try:
+        args = dict(orient="records", indent=2, force_ascii=False)
+
+        # Save current results
+        if len(results_df) > 0:
+            results_df.to_json("results.json", **args)
+            print(f"💾 Checkpoint saved: {len(results_df)} results (batch {batch_num}/{total_batches})")
+
+        # Save model and language info
+        models_df.to_json("models.json", **args)
+        languages_df.to_json("languages.json", **args)
+
+        # Save checkpoint metadata
+        checkpoint_info = {
+            "last_batch": batch_num,
+            "total_batches": total_batches,
+            "timestamp": datetime.now().isoformat(),
+            "results_count": len(results_df)
+        }
+        with open("checkpoint.json", "w") as f:
+            json.dump(checkpoint_info, f, indent=2)
+
+    except Exception as e:
+        print(f"⚠️ Failed to save checkpoint: {e}")
+
+def load_checkpoint():
+    """Load previous checkpoint if available"""
+    try:
+        if os.path.exists("checkpoint.json"):
+            with open("checkpoint.json", "r") as f:
+                checkpoint = json.load(f)
+            print(f"📂 Found checkpoint from batch {checkpoint['last_batch']}/{checkpoint['total_batches']}")
+            return checkpoint
+    except Exception as e:
+        print(f"⚠️ Failed to load checkpoint: {e}")
+    return None
+
 async def evaluate():
     # FIXME we should not need this for-loop, but it helps
     n_sentences = int(os.environ.get("N_SENTENCES", 15))  # Default 1 for quick testing
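
A minimal sketch of the round trip these two helpers implement, under the same file layout (values below are illustrative; in the real flow the calls happen inside evaluate(), further down in this diff):

import json
from datetime import datetime

# What save_checkpoint() writes next to results.json / models.json / languages.json
checkpoint_info = {
    "last_batch": 3,                       # last batch that was fully processed
    "total_batches": 10,
    "timestamp": datetime.now().isoformat(),
    "results_count": 1234,                 # rows in results.json at that point
}
with open("checkpoint.json", "w") as f:
    json.dump(checkpoint_info, f, indent=2)

# What load_checkpoint() hands back on the next run
with open("checkpoint.json") as f:
    checkpoint = json.load(f)
print(checkpoint["last_batch"])  # -> 3, used as the batch index to resume from
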
@@ -29,13 +68,30 @@ async def evaluate():
     top_languages = languages.head(max_languages)  # Top N by population
     print(f"🌍 Evaluating top {len(top_languages)} languages by speakers (max: {max_languages})")
 
+    # Load checkpoint if available
+    checkpoint = load_checkpoint()
+    start_batch = 0
+    if checkpoint:
+        start_batch = checkpoint['last_batch']
+        print(f"🔄 Resuming from batch {start_batch}")
+
     # For testing, just use all available languages up to max_languages
     for n_languages in [min(max_languages, len(top_languages))]:
         print(f"running evaluations for {n_languages} languages")
-        old_results = pd.read_json("results.json")
-        if old_results.empty:
+
+        # Load existing results
+        try:
+            old_results = pd.read_json("results.json")
+            if old_results.empty:
+                old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
+        except FileNotFoundError:
             old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
-        old_models = pd.read_json("models.json")
+
+        try:
+            old_models = pd.read_json("models.json")
+        except FileNotFoundError:
+            old_models = pd.DataFrame()
+
         # get all combinations of model, language and task
         combis = [
             (model, lang.bcp_47, task_name)
@@ -60,9 +116,14 @@ async def evaluate():
         batch_size = 200  # Process 200 tasks at a time (optimized for GitHub Actions)
         all_results = []
 
-        for i in range(0, len(all_tasks), batch_size):
+        # Calculate total batches for progress tracking
+        total_batches = (len(all_tasks) + batch_size - 1) // batch_size
+
+        for i in range(start_batch * batch_size, len(all_tasks), batch_size):
             batch = all_tasks[i:i+batch_size]
-            print(f"📦 Processing batch {i//batch_size + 1}/{(len(all_tasks) + batch_size - 1)//batch_size} ({len(batch)} tasks)")
+            current_batch = i // batch_size + 1
+
+            print(f"📦 Processing batch {current_batch}/{total_batches} ({len(batch)} tasks)")
 
             # Show what's being evaluated in this batch
             batch_summary = {}
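
The batch bookkeeping in this hunk is plain ceiling division plus an offset for the resumed batch; a quick worked example with made-up sizes:

# Made-up sizes to illustrate the resume arithmetic used above
all_tasks = list(range(1050))   # pretend task list
batch_size = 200
start_batch = 3                 # e.g. last_batch loaded from checkpoint.json

total_batches = (len(all_tasks) + batch_size - 1) // batch_size
print(total_batches)            # -> 6 (five full batches of 200, one of 50)

# Resuming: skip the batches that were already checkpointed
for i in range(start_batch * batch_size, len(all_tasks), batch_size):
    current_batch = i // batch_size + 1
    print(current_batch, len(all_tasks[i:i + batch_size]))
# -> 4 200 / 5 200 / 6 50
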
@@ -86,12 +147,57 @@ async def evaluate():
             for task_data in batch:
                 task_func, model, bcp_47, sentence_nr = task_data
                 batch_coroutines.append(task_func(model, bcp_47, sentence_nr))
-            batch_results = await asyncio.gather(*batch_coroutines, return_exceptions=True)
-            all_results.extend(batch_results)
+
+            try:
+                batch_results = await asyncio.gather(*batch_coroutines, return_exceptions=True)
+                all_results.extend(batch_results)
+
+                # Save checkpoint after each batch
+                valid_results = []
+                exception_count = 0
+                for r in batch_results:
+                    if isinstance(r, Exception):
+                        exception_count += 1
+                        continue
+                    if isinstance(r, list):
+                        valid_results.extend(r)
+                    else:
+                        valid_results.append(r)
+
+                if valid_results:
+                    # Aggregate results
+                    batch_df = pd.DataFrame(valid_results)
+                    if len(batch_df) > 0:
+                        batch_df = (
+                            batch_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
+                            .agg({"score": "mean"})
+                            .reset_index()
+                        )
+                        # Merge with existing results
+                        all_results_df = pd.concat([old_results, batch_df])
+                        all_results_df = all_results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
+                        all_results_df = all_results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
+
+                        # Save checkpoint
+                        save_checkpoint(all_results_df, models_df, languages_df, current_batch, total_batches)
+
+                        # Update old_results for next batch
+                        old_results = all_results_df
+
+                print(f"✅ Batch {current_batch} completed: {len(valid_results)} valid results, {exception_count} errors")
+
+            except Exception as e:
+                print(f"❌ Batch {current_batch} failed: {e}")
+                # Save checkpoint even on failure
+                if len(all_results) > 0:
+                    results_df = pd.DataFrame(all_results)
+                    save_checkpoint(results_df, models_df, languages_df, current_batch, total_batches)
+                continue
 
             # Reduced delay between batches (optimized for GitHub Actions)
             await asyncio.sleep(0.5)
 
+        # Final aggregation and save
         results = all_results
         # Filter out exceptions and flatten results
         valid_results = []
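
The per-batch merge above leans on pandas' drop_duplicates default keep='first': because old_results is concatenated first, a score that was already checkpointed wins over a re-computed row with the same (model, bcp_47, task, metric, origin) key. A small sketch with toy data:

import pandas as pd

key = ["model", "bcp_47", "task", "metric", "origin"]

old_results = pd.DataFrame(
    [{"model": "m", "bcp_47": "en", "task": "t", "metric": "acc", "origin": "x", "score": 0.70}]
)
batch_df = pd.DataFrame(
    [
        {"model": "m", "bcp_47": "en", "task": "t", "metric": "acc", "origin": "x", "score": 0.90},  # same key
        {"model": "m", "bcp_47": "de", "task": "t", "metric": "acc", "origin": "x", "score": 0.60},  # new key
    ]
)

merged = (
    pd.concat([old_results, batch_df])
    .drop_duplicates(subset=key)          # keep='first' -> the checkpointed 0.70 survives
    .sort_values(by=["model", "bcp_47", "task", "metric"])
)
print(merged[["bcp_47", "score"]].to_string(index=False))
# de gets 0.6 (new key), en keeps 0.7 (old checkpointed value)
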
@@ -108,7 +214,7 @@ async def evaluate():
         print(f"⚠️ Encountered {exception_count} API errors (model unavailable/rate limits)")
         print(f"✅ Successfully processed {len(valid_results)} evaluations")
 
-        # Save partial results even if some failed
+        # Save final results
         if valid_results:
             results = valid_results
             args = dict(orient="records", indent=2, force_ascii=False)
@@ -124,6 +230,7 @@ async def evaluate():
             # Merge with old results
             old_results = pd.read_json("results.json")
             results_df = pd.concat([old_results, results_df])
+            results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
             results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
             results_df.to_json("results.json", **args)
             print(f"💾 Saved {len(results_df)} aggregated results to results.json")
@@ -153,10 +260,10 @@ async def evaluate():
     print(f"✅ Full evaluation completed in {elapsed_str}")
     print(f"🎉 Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
 
-    # Save results locally
-    with open("results.json", "w") as f:
-        json.dump(results, f, indent=2)
-    print(f"💾 Results saved to results.json")
+    # Clean up checkpoint file on successful completion
+    if os.path.exists("checkpoint.json"):
+        os.remove("checkpoint.json")
+        print("🧹 Cleaned up checkpoint file")
 
     return results
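
With these changes, an interrupted run leaves checkpoint.json and results.json behind, and a completed run removes checkpoint.json again. A small status check before re-running could be sketched like this (the script name check_progress.py is made up, not part of this PR):

# check_progress.py -- hypothetical status helper, not part of this PR
import json
import os

import pandas as pd

if os.path.exists("checkpoint.json"):
    with open("checkpoint.json") as f:
        cp = json.load(f)
    print(f"Interrupted at batch {cp['last_batch']}/{cp['total_batches']} "
          f"({cp['results_count']} results, saved {cp['timestamp']})")
else:
    print("No checkpoint.json: fresh start, or the previous run finished and cleaned up")

if os.path.exists("results.json"):
    print(f"results.json holds {len(pd.read_json('results.json'))} aggregated rows")
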