# demo/backend/tests/run_lighteval.py
import datetime
import os
import subprocess
import tempfile
import time

from dotenv import load_dotenv

# Load environment variables
load_dotenv()
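# Assumes the .env file provides the credentials lighteval needs at runtime
# (typically a Hugging Face token for the inference-providers endpoint).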
# Create a temporary task file for lighteval to load via --custom-tasks
with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as temp_file:
    temp_file.write("""
from lighteval_task.lighteval_task import create_yourbench_task

# Create yourbench task
yourbench = create_yourbench_task("yourbench/yourbench_fbfe278f-70c8-4579-9447-8275b94250bd", "single_shot_questions")

# Define TASKS_TABLE needed by lighteval
TASKS_TABLE = [yourbench]
""")
    temp_file_path = temp_file.name
# Create a timestamped output directory so earlier results are not overwritten
output_dir = f"data/lighteval_results_strict_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
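# yields e.g. data/lighteval_results_strict_20250101_120000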
# LightEval command: run the custom yourbench task against the model
cmd_args = [
    "lighteval",
    "endpoint",
    "inference-providers",  # query the model through Hugging Face inference providers
    "model=Qwen/Qwen2.5-72B-Instruct,provider=novita",
    "custom|yourbench|0|0",  # task spec: suite|task|few-shot count|truncate-few-shots flag
    "--custom-tasks", temp_file_path,
    "--max-samples", "10",  # cap evaluation at 10 samples
    "--output-dir", output_dir,
    "--save-details",  # also save per-sample details
    "--no-push-to-hub",
]
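# For reference, the equivalent shell invocation is roughly:
#   lighteval endpoint inference-providers \
#     "model=Qwen/Qwen2.5-72B-Instruct,provider=novita" \
#     "custom|yourbench|0|0" \
#     --custom-tasks <temp task file> --max-samples 10 \
#     --output-dir <output dir> --save-details --no-push-to-hub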
# Start timer
start_time = time.time()

try:
    # Run the command, inheriting the environment loaded from .env
    subprocess.run(cmd_args, env=os.environ)
finally:
    # Clean up the temporary task file even if the run fails
    os.unlink(temp_file_path)

# Calculate and print execution time
execution_time = time.time() - start_time
print(f"\nExecution time: {execution_time:.2f} seconds")
print(f"Results saved in: {output_dir}")
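# Note: with --save-details, lighteval typically writes aggregate scores under
# <output_dir>/results/ and per-sample records under <output_dir>/details/
# (an assumption based on lighteval's default output layout).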