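"""Evaluate models across languages and tasks.

Results are cached in results.json so already-evaluated (model, language, task)
combinations are skipped; API errors are written to errors.log, and model and
language metadata are saved to models.json and languages.json.
"""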
import asyncio
import pandas as pd
import time
from datetime import datetime, timedelta
from models import models
from tasks import tasks
from languages import languages
import os


async def evaluate():
    # Configuration - easily adjustable defaults
    n_sentences = int(
        os.environ.get("N_SENTENCES", 20)
    )  # Default: 20 sentences per task
    max_languages = int(
        os.environ.get("MAX_LANGUAGES", 150)
    )  # Default: 150 top languages
    single_model = os.environ.get(
        "SINGLE_MODEL"
    )  # Optional: run only one specific model
    test_mode = os.environ.get("TEST", "").lower() in (
        "1",
        "true",
        "yes",
    )  # Optional: skip results loading/saving
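    # Example invocation (values are illustrative; "evaluate.py" assumes this file's name):
    #   N_SENTENCES=5 MAX_LANGUAGES=10 SINGLE_MODEL=<model-id> TEST=1 python evaluate.py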

    # Keep the original DataFrames for saving metadata; the working copies below may be filtered for single-model test runs.
    original_models_df = pd.DataFrame(models)
    original_languages_df = pd.DataFrame(languages)

    # Create working copies that single-model runs can filter without touching the originals
    models_df = original_models_df.copy()
    languages_df = original_languages_df.copy()
    top_languages = languages_df.head(max_languages)

    # Filter to single model if specified (only affects evaluation, not saving)
    if single_model:
        models_df = models_df[models_df["id"] == single_model]
        if len(models_df) == 0:
            print(f"Error: Model '{single_model}' not found. Available models:")
            for model_id in original_models_df["id"]:
                print(f"  {model_id}")
            return pd.DataFrame()

    print(
        f"Starting evaluation: {len(models_df)} models, {len(top_languages)} languages, {n_sentences} sentences per task"
    )
    if test_mode:
        print("TEST MODE: Skipping results loading/saving")
    start_time = time.time()

    # Load existing results to avoid re-evaluation (skip in test mode)
    if test_mode:
        old_results = pd.DataFrame(
            columns=["model", "bcp_47", "task", "metric", "origin", "score"]
        )
    else:
        old_results = pd.read_json("results.json")

    # Get all combinations that need evaluation
    combis = [
        (model, lang.bcp_47, task_name)
        for model in models_df["id"]
        for lang in top_languages.itertuples()
        for task_name, task in tasks.items()
        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
    ]

    # Filter out already evaluated combinations
    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
    if not old_results.empty:
        completed = set(old_results[["model", "bcp_47", "task"]].apply(tuple, axis=1))
        # Checking membership in a set is faster than a DataFrame merge here (noticeable locally when all data/tasks are loaded into memory)
        mask = ~combis.apply(
            lambda row: (row["model"], row["bcp_47"], row["task"]) in completed, axis=1
        )
        combis = combis[mask]

    # Create all evaluation tasks
    all_tasks = []
    for i in range(n_sentences):
        for model, bcp_47, task_name in combis.itertuples(index=False):
            all_tasks.append((tasks[task_name], model, bcp_47, i))

    print(f"Running {len(all_tasks)} evaluation tasks...")

    # For single-model runs, stop immediately on the first API error so it can be inspected.
    # For full evaluations, continue despite errors to get maximum coverage.
    stop_on_error = single_model is not None

    # Process tasks in batches to avoid memory issues (this helped a lot for full local evaluations)
    batch_size = 1000
    all_results = []

    try:
        for i in range(0, len(all_tasks), batch_size):
            batch = all_tasks[i : i + batch_size]
            batch_results = await asyncio.gather(
                *[
                    task_func(model, bcp_47, sentence_nr)
                    for task_func, model, bcp_47, sentence_nr in batch
                ],
                return_exceptions=not stop_on_error,
            )
            all_results.extend(batch_results)

        results = all_results

        # Separate valid results from API errors so the main failure modes are easy to inspect
        valid_results = []
        errors = []

        for i, r in enumerate(results):
            if isinstance(r, Exception):
                if i < len(all_tasks):
                    task_info = all_tasks[i]
                    errors.append(f"{task_info[1]},{task_info[2]},{str(r)}")
            elif isinstance(r, list):
                valid_results.extend(r)
            elif r is not None:
                valid_results.append(r)

        # Write errors to errors.log in CSV format for later inspection
        if errors:
            with open("errors.log", "w") as f:
                f.write("model,bcp_47,error\n")
                for error in errors:
                    f.write(error + "\n")

        # Track model completion (TO BE DELETED - progress logging for local runs only)
        if valid_results:
            completed_models = set()
            for result in valid_results:
                if isinstance(result, dict) and "model" in result:
                    model = result["model"]
                    if model not in completed_models:
                        completed_models.add(model)
                        print(f"Completed: {model}")

        print(f"Completed: {len(valid_results)} valid results, {len(errors)} errors")

    # Stop-on-error path, used for local single-model runs during testing and development
    except Exception as e:
        print("EVALUATION STOPPED - API error occurred:")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {str(e)}")
        return pd.DataFrame()

    # Save results (skipped in test mode as we do not want to overwrite existing results)
    if valid_results:
        results_df = pd.DataFrame(valid_results)

        # Aggregate results
        results_df = (
            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
            .agg({"score": "mean"})
            .reset_index()
        )

        if not test_mode:
            args = dict(orient="records", indent=2, force_ascii=False)

            # Merge with existing results
            if not old_results.empty:
                results_df = pd.concat([old_results, results_df])
                results_df = results_df.drop_duplicates(
                    subset=["model", "bcp_47", "task", "metric", "origin"]
                )

            results_df = results_df.sort_values(
                by=["model", "bcp_47", "task", "metric"]
            )
            results_df.to_json("results.json", **args)

            # Save model and language info (always save complete metadata, not filtered)
            original_models_df.to_json("models.json", **args)
            original_languages_df.to_json("languages.json", **args)
        else:
            print("TEST MODE: Skipping results saving")

        elapsed = time.time() - start_time
        print(f"Evaluation completed in {str(timedelta(seconds=int(elapsed)))}")

        return results_df

    return pd.DataFrame()


if __name__ == "__main__":
    results = asyncio.run(evaluate())