import asyncio
import os
import time
from datetime import timedelta

import pandas as pd

from languages import languages
from models import models
from tasks import tasks
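
# This script evaluates every supported (model, language, task) combination on a
# number of sample sentences, averages the scores per combination, and writes
# results.json, models.json, and languages.json (skipped in test mode). Failed
# tasks are logged to errors.log. Configuration is read from environment variables:
#   N_SENTENCES    sentences per task (default 20)
#   MAX_LANGUAGES  number of languages to evaluate (default 150)
#   SINGLE_MODEL   restrict the run to a single model id
#   TEST           "1"/"true"/"yes" skips loading and saving result files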


async def evaluate():
    n_sentences = int(os.environ.get("N_SENTENCES", 20))
    max_languages = int(os.environ.get("MAX_LANGUAGES", 150))
    single_model = os.environ.get("SINGLE_MODEL")
    test_mode = os.environ.get("TEST", "").lower() in ("1", "true", "yes")

    original_models_df = pd.DataFrame(models)
    original_languages_df = pd.DataFrame(languages)

    models_df = original_models_df.copy()
    languages_df = original_languages_df.copy()
    # Evaluate only the first max_languages languages.
    top_languages = languages_df.head(max_languages)

    # Optionally restrict the run to a single model.
    if single_model:
        models_df = models_df[models_df["id"] == single_model]
        if len(models_df) == 0:
            print(f"Error: Model '{single_model}' not found. Available models:")
            for model_id in original_models_df["id"]:
                print(f"  {model_id}")
            return pd.DataFrame()

    print(
        f"Starting evaluation: {len(models_df)} models, {len(top_languages)} languages, "
        f"{n_sentences} sentences per task"
    )
    if test_mode:
        print("TEST MODE: Skipping results loading/saving")
    start_time = time.time()

    # Load previous results (skipped in test mode).
    if test_mode:
        old_results = pd.DataFrame(
            columns=["model", "bcp_47", "task", "metric", "origin", "score"]
        )
    else:
        old_results = pd.read_json("results.json")

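    # Build the cross product of models x languages x tasks, keeping only the
    # tasks each model supports, then drop combinations that already have a
    # score in old_results so interrupted runs can resume where they left off.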
    combis = [
        (model, lang.bcp_47, task_name)
        for model in models_df["id"]
        for lang in top_languages.itertuples()
        for task_name, task in tasks.items()
        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
    ]
    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])

    if not old_results.empty:
        completed = set(old_results[["model", "bcp_47", "task"]].apply(tuple, axis=1))
        mask = ~combis.apply(
            lambda row: (row["model"], row["bcp_47"], row["task"]) in completed, axis=1
        )
        combis = combis[mask]

    # One coroutine per remaining combination and sentence index.
    all_tasks = []
    for i in range(n_sentences):
        for model, bcp_47, task_name in combis.itertuples(index=False):
            all_tasks.append((tasks[task_name], model, bcp_47, i))

    print(f"Running {len(all_tasks)} evaluation tasks...")

    # With a single model, fail fast on the first error instead of collecting exceptions.
    stop_on_error = single_model is not None

    # Run the coroutines in batches to limit the number of concurrent requests.
    batch_size = 1000
    all_results = []

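    # Each batch is awaited with asyncio.gather. When stop_on_error is False,
    # return_exceptions=True makes failed calls come back as Exception objects
    # in the result list instead of aborting the batch; they are filtered out
    # and logged below.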
    try:
        for i in range(0, len(all_tasks), batch_size):
            batch = all_tasks[i : i + batch_size]
            batch_results = await asyncio.gather(
                *[
                    task_func(model, bcp_47, sentence_nr)
                    for task_func, model, bcp_47, sentence_nr in batch
                ],
                return_exceptions=not stop_on_error,
            )
            all_results.extend(batch_results)

        results = all_results

        # Separate successful results from exceptions returned by gather.
        valid_results = []
        errors = []

        for i, r in enumerate(results):
            if isinstance(r, Exception):
                if i < len(all_tasks):
                    task_info = all_tasks[i]
                    errors.append(f"{task_info[1]},{task_info[2]},{str(r)}")
            elif isinstance(r, list):
                valid_results.extend(r)
            elif r is not None:
                valid_results.append(r)

        # Write failed tasks to a simple CSV-style log.
        if errors:
            with open("errors.log", "w") as f:
                f.write("model,bcp_47,error\n")
                for error in errors:
                    f.write(error + "\n")

        # Report each model that produced at least one result.
        if valid_results:
            completed_models = set()
            for result in valid_results:
                if isinstance(result, dict) and "model" in result:
                    model = result["model"]
                    if model not in completed_models:
                        completed_models.add(model)
                        print(f"Completed: {model}")

        print(f"Completed: {len(valid_results)} valid results, {len(errors)} errors")

    except Exception as e:
        print("EVALUATION STOPPED - API Error occurred:")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {e}")
        return pd.DataFrame()

    if valid_results:
        results_df = pd.DataFrame(valid_results)

        # Average the per-sentence scores for each combination.
        results_df = (
            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
            .agg({"score": "mean"})
            .reset_index()
        )

        if not test_mode:
            args = dict(orient="records", indent=2, force_ascii=False)

            # Merge with previous results; on duplicates, the older entry is kept.
            if not old_results.empty:
                results_df = pd.concat([old_results, results_df])
                results_df = results_df.drop_duplicates(
                    subset=["model", "bcp_47", "task", "metric", "origin"]
                )

            results_df = results_df.sort_values(
                by=["model", "bcp_47", "task", "metric"]
            )
            results_df.to_json("results.json", **args)

            original_models_df.to_json("models.json", **args)
            original_languages_df.to_json("languages.json", **args)
        else:
            print("TEST MODE: Skipping results saving")

        elapsed = time.time() - start_time
        print(f"Evaluation completed in {timedelta(seconds=int(elapsed))}")

        return results_df

    return pd.DataFrame()


if __name__ == "__main__":
    results = asyncio.run(evaluate())
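
# Example invocations (assuming this file is saved as evaluate.py; adjust the
# file name to match the actual module):
#   TEST=1 N_SENTENCES=2 MAX_LANGUAGES=3 python evaluate.py   # quick smoke test
#   SINGLE_MODEL=<model-id> python evaluate.py                # evaluate one model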