"""
Task to run evaluation using lighteval
"""
import os
import time
import shutil
import subprocess
import tempfile
from pathlib import Path
import concurrent.futures
from dotenv import load_dotenv
from datetime import datetime
import json
from typing import List, Dict
from tasks.get_model_providers import get_model_providers
from huggingface_hub import HfApi
import asyncio


class EvaluationTask:
    """
    Task to run evaluation using lighteval

    One lighteval subprocess is launched per model, all of them concurrently,
    and the collected results are uploaded as ``lighteval_results.json`` to
    the dataset repository on the Hub.
    """

    # Maximum wall-clock time (seconds) granted to a single lighteval run.
    # Also reported as "execution_time" for runs that hit the timeout.
    EVAL_TIMEOUT_SECONDS = 60

    def __init__(self, session_uid: str, dataset_name: str):
        """
        Initialize the evaluation task

        Args:
            session_uid: Session ID for this task
            dataset_name: Name of the dataset to evaluate
        """
        self.session_uid = session_uid
        self.dataset_name = dataset_name
        self.is_completed = False
        self.results = []
        self.hf_api = HfApi()

    def _save_results_to_hub(self) -> None:
        """
        Save evaluation results directly to the dataset on the Hub without persisting locally

        The results are serialized to a temporary JSON file, uploaded, and the
        temporary file is removed whether or not the upload succeeded.
        Failures are logged and swallowed (best effort).
        """
        temp_file_path = None
        try:
            # Create a temporary file holding the results
            with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
                temp_file_path = temp_file.name
                json.dump(self.results, temp_file, indent=2)

            # Push to Hub
            self.hf_api.upload_file(
                path_or_fileobj=temp_file_path,
                path_in_repo="lighteval_results.json",
                repo_id=self.dataset_name,
                repo_type="dataset",
                commit_message="Add lighteval evaluation results"
            )

            print(f"[{datetime.now().strftime('%H:%M:%S')}] Results saved to Hub at {self.dataset_name}/lighteval_results.json")
        except Exception as e:
            print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to save results to Hub: {str(e)}")
        finally:
            # Delete the temporary file even when the upload failed
            if temp_file_path is not None:
                try:
                    os.unlink(temp_file_path)
                except OSError:
                    pass

    def _write_task_file(self, dataset_name: str) -> str:
        """
        Write the temporary lighteval custom-task module and return its path

        Args:
            dataset_name: Name of the dataset the generated task points at

        Returns:
            Path of the generated ``.py`` file; the caller must delete it
        """
        # repr() embeds the name as a valid Python string literal, so quotes
        # or backslashes in it cannot break (or inject into) the generated code.
        task_source = (
            "\n"
            "from lighteval_task.lighteval_task import create_yourbench_task\n"
            "\n"
            "# Create yourbench task\n"
            f"yourbench = create_yourbench_task({dataset_name!r}, \"multi_hop_questions\")\n"
            "\n"
            "# Define TASKS_TABLE needed by lighteval\n"
            "TASKS_TABLE = [yourbench]\n"
        )
        # NamedTemporaryFile creates the file atomically, unlike the
        # deprecated, race-prone tempfile.mktemp().
        with tempfile.NamedTemporaryFile(mode='w', suffix=".py", delete=False) as temp_file:
            temp_file.write(task_source)
            return temp_file.name

    async def _run_lighteval(self, model_name: str, provider: str, dataset_name: str) -> dict:
        """
        Run lighteval for one model/provider pair in a subprocess

        Args:
            model_name: Model identifier passed to lighteval
            provider: Inference provider passed to lighteval
            dataset_name: Name of the dataset to evaluate

        Returns:
            Dict with keys "model", "provider", "accuracy", "execution_time"
            and "status" ("success", "timeout", "error" or "parse_error")
        """
        start_time = time.time()
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting evaluation with {provider} provider for {model_name}")

        def build_result(accuracy, execution_time, status) -> dict:
            return {
                "model": model_name,
                "provider": provider,
                "accuracy": accuracy,
                "execution_time": execution_time,
                "status": status
            }

        # Create temporary task file
        temp_file_path = self._write_task_file(dataset_name)
        temp_output_dir = None

        try:
            # Create temporary output directory
            temp_output_dir = tempfile.mkdtemp(prefix="lighteval_")

            # LightEval command
            cmd_args = [
                "lighteval",
                "endpoint",
                "inference-providers",
                f"model={model_name},provider={provider}",
                "custom|yourbench|0|0",
                "--custom-tasks",
                temp_file_path,
                "--max-samples", "30",
                "--output-dir", temp_output_dir,
                "--no-push-to-hub"
            ]

            try:
                # Run the command with the current environment variables
                process = await asyncio.create_subprocess_exec(
                    *cmd_args,
                    env=os.environ,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE
                )

                try:
                    await asyncio.wait_for(process.communicate(), timeout=self.EVAL_TIMEOUT_SECONDS)
                except asyncio.TimeoutError:
                    try:
                        process.kill()
                    except ProcessLookupError:
                        # Process exited between the timeout and the kill
                        pass
                    # Reap the child so it does not linger as a zombie
                    await process.wait()
                    print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
                    return build_result(0.0, float(self.EVAL_TIMEOUT_SECONDS), "timeout")
            except Exception as e:
                print(f"[{datetime.now().strftime('%H:%M:%S')}] Error running evaluation for {model_name}: {str(e)}")
                return build_result(0.0, time.time() - start_time, "error")

            # Calculate execution time
            execution_time = time.time() - start_time
            print(f"[{datetime.now().strftime('%H:%M:%S')}] Finished evaluation for {model_name} in {execution_time:.2f}s")

            try:
                # Get results from the output file; a "/" in the model name
                # simply becomes a nested directory under "results".
                results_dir = Path(temp_output_dir) / "results" / model_name
                results_file = next(results_dir.glob("results_*.json"), None)
                if results_file is None:
                    raise FileNotFoundError(f"no results_*.json file found in {results_dir}")

                with open(results_file) as f:
                    results = json.load(f)
                accuracy = results["results"]["all"]["accuracy"]

                return build_result(accuracy, execution_time, "success")
            except Exception as e:
                print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to parse results for {model_name} after {execution_time:.2f}s: {str(e)}")
                return build_result(0.0, execution_time, "parse_error")
        finally:
            # Clean up temporary files and directories on every exit path
            try:
                os.unlink(temp_file_path)
            except OSError:
                pass
            if temp_output_dir is not None:
                shutil.rmtree(temp_output_dir, ignore_errors=True)

    async def run(self) -> None:
        """
        Run the evaluation task asynchronously

        Evaluates every model that has at least one provider (using the first
        provider listed), stores the per-model result dicts in ``self.results``,
        uploads them to the Hub and marks the task as completed.
        """
        # Start global timer
        script_start_time = time.time()

        # Load environment variables
        load_dotenv()

        # Models to evaluate
        models = [
            "Qwen/QwQ-32B",
            "Qwen/Qwen2.5-72B-Instruct",
            "deepseek-ai/DeepSeek-V3-0324",
            "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
        ]

        # Get providers for each model
        model_providers = get_model_providers(models)

        print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting parallel evaluations")

        # Run evaluations in parallel using asyncio
        tasks = []
        for model_name, providers in model_providers:
            if providers:  # Only run if providers are available
                tasks.append(self._run_lighteval(model_name, providers[0], self.dataset_name))

        self.results = await asyncio.gather(*tasks)

        # Calculate total script execution time
        total_time = time.time() - script_start_time
        print(f"[{datetime.now().strftime('%H:%M:%S')}] All evaluations completed in {total_time:.2f}s")

        # Cleanup intermediate results if they exist
        if os.path.exists("data/lighteval_results"):
            print(f"[{datetime.now().strftime('%H:%M:%S')}] Cleaning up intermediate results")
            try:
                # Recursively delete intermediate results
                shutil.rmtree("data/lighteval_results", ignore_errors=True)
            except Exception as e:
                print(f"[{datetime.now().strftime('%H:%M:%S')}] Warning: Failed to clean up intermediate results: {str(e)}")

        # Save final results to Hub (only once)
        self._save_results_to_hub()

        # Mark the task as completed
        self.is_completed = True

    def get_logs(self) -> List[str]:
        """
        Get logs for this task (empty list since we don't track logs anymore)

        Returns:
            Empty list of logs
        """
        return []

    def is_task_completed(self) -> bool:
        """
        Check if the task is completed

        Returns:
            True if completed, False otherwise
        """
        return self.is_completed