import asyncio
import os
import time
from datetime import timedelta

import pandas as pd

from languages import languages
from models import models
from tasks import tasks


async def evaluate():
    """Run all missing (model, language, task) evaluations and aggregate the scores."""
    # Configuration - easily adjustable defaults via environment variables
    n_sentences = int(os.environ.get("N_SENTENCES", 20))  # Default: 20 sentences per task
    max_languages = int(os.environ.get("MAX_LANGUAGES", 150))  # Default: 150 top languages
    single_model = os.environ.get("SINGLE_MODEL")  # Optional: run only one specific model
    test_mode = os.environ.get("TEST", "").lower() in ("1", "true", "yes")  # Optional: skip results loading/saving

    # Keep original DataFrames for saving metadata
    original_models_df = pd.DataFrame(models)
    original_languages_df = pd.DataFrame(languages)

    # Create working copies for this evaluation run
    models_df = original_models_df.copy()
    languages_df = original_languages_df.copy()
    top_languages = languages_df.head(max_languages)

    # Filter to a single model if specified (only affects evaluation, not saving)
    if single_model:
        models_df = models_df[models_df["id"] == single_model]
        if len(models_df) == 0:
            print(f"Error: Model '{single_model}' not found. Available models:")
            for model_id in original_models_df["id"]:
                print(f"  {model_id}")
            return pd.DataFrame()

    print(
        f"Starting evaluation: {len(models_df)} models, {len(top_languages)} languages, "
        f"{n_sentences} sentences per task"
    )
    if test_mode:
        print("TEST MODE: Skipping results loading/saving")

    start_time = time.time()

    # Load existing results to avoid re-evaluation (skip in test mode)
    result_columns = ["model", "bcp_47", "task", "metric", "origin", "score"]
    if test_mode:
        old_results = pd.DataFrame(columns=result_columns)
    else:
        try:
            old_results = pd.read_json("results.json")
            if old_results.empty:
                old_results = pd.DataFrame(columns=result_columns)
        except FileNotFoundError:
            old_results = pd.DataFrame(columns=result_columns)

    # Get all combinations that need evaluation
    combis = [
        (model, lang.bcp_47, task_name)
        for model in models_df["id"]
        for lang in top_languages.itertuples()
        for task_name in tasks
        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
    ]

    # Filter out already evaluated combinations
    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]

    # Create all evaluation tasks: one coroutine per (task, model, language, sentence index)
    all_tasks = []
    for i in range(n_sentences):
        for model, bcp_47, task_name in combis.itertuples(index=False):
            all_tasks.append((tasks[task_name], model, bcp_47, i))

    print(f"Running {len(all_tasks)} evaluation tasks...")

    # Run all tasks with a simple asyncio.gather, but stop on the first error
    try:
        results = await asyncio.gather(
            *[
                task_func(model, bcp_47, sentence_nr)
                for task_func, model, bcp_47, sentence_nr in all_tasks
            ],
            return_exceptions=False,  # raise on the first exception
        )
        # Flatten results - no exceptions should reach here
        valid_results = []
        for r in results:
            if isinstance(r, list):
                valid_results.extend(r)
            else:
                valid_results.append(r)
        print(f"Completed: {len(valid_results)} valid results")
    except Exception as e:
        print("EVALUATION STOPPED - API error occurred:")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {e}")
        return pd.DataFrame()

    # Aggregate and save results (saving is skipped in test mode)
    if valid_results:
        results_df = pd.DataFrame(valid_results)
        # Average scores per (model, language, task, metric, origin)
        results_df = (
            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
            .agg({"score": "mean"})
            .reset_index()
        )
        if not test_mode:
            args = dict(orient="records", indent=2, force_ascii=False)
            # Merge with existing results, keeping one row per combination
            if not old_results.empty:
                results_df = pd.concat([old_results, results_df])
            results_df = results_df.drop_duplicates(
                subset=["model", "bcp_47", "task", "metric", "origin"]
            )
            results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
            results_df.to_json("results.json", **args)
            # Save model and language info (always save complete metadata, not the filtered copies)
            original_models_df.to_json("models.json", **args)
            original_languages_df.to_json("languages.json", **args)
        else:
            print("TEST MODE: Skipping results saving")

        elapsed = time.time() - start_time
        print(f"Evaluation completed in {timedelta(seconds=int(elapsed))}")
        return results_df

    return pd.DataFrame()


if __name__ == "__main__":
    results = asyncio.run(evaluate())