demo/backend/tests/run_parallel_lighteval.py
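
"""Run LightEval evaluations for several models in parallel.

Each model is evaluated on the yourbench "single_shot_questions" task through its
Hugging Face inference provider, with a per-model timeout; accuracy and execution
time are printed for every model at the end.
"""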
import tempfile
import time
import subprocess
import os
import json
from pathlib import Path
import concurrent.futures
from dotenv import load_dotenv
from datetime import datetime
import yaml
import argparse
from typing import Dict, Any
from tqdm import tqdm
from tools.lighteval.get_model_providers import get_model_providers
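# get_model_providers is assumed to return a list of (model_name, providers) pairs,
# where `providers` is a (possibly empty) list of inference-provider names; only the
# first available provider is used for each model.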


def run_lighteval(model_name: str, provider: str) -> dict:
    start_time = time.time()
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting evaluation with {provider} provider for {model_name}")

    # Create temporary task file
    temp_file_path = tempfile.mktemp(suffix=".py")
    with open(temp_file_path, 'w') as temp_file:
        temp_file.write("""
from lighteval_task.lighteval_task import create_yourbench_task

# Create yourbench task
yourbench = create_yourbench_task("yourbench/yourbench_test", "single_shot_questions")

# Define TASKS_TABLE needed by lighteval
TASKS_TABLE = [yourbench]
""")

    # LightEval command
    cmd_args = [
        "lighteval",
        "endpoint",
        "inference-providers",
        f"model={model_name},provider={provider}",
        "custom|yourbench|0|0",
        "--custom-tasks",
        temp_file_path,
        "--max-samples", "3",
        "--output-dir", "data/lighteval_results",
        # "--save-details",
        "--no-push-to-hub"
    ]

    try:
        # Run the command with environment variables and a timeout of 60 seconds
        subprocess.run(cmd_args, env=os.environ, timeout=60)
    except subprocess.TimeoutExpired:
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
        # Remove the temporary task file before returning
        os.unlink(temp_file_path)
        return {
            "model": model_name,
            "provider": provider,
            "accuracy": 0.0,
            "execution_time": 60.0,
            "status": "timeout"
        }

    # Calculate execution time
    execution_time = time.time() - start_time
    print(f"[{datetime.now().strftime('%H:%M:%S')}] Finished evaluation for {model_name} in {execution_time:.2f}s")

    # Clean up the temporary task file
    os.unlink(temp_file_path)

    try:
        # Get results from the output file
        results_dir = Path("data/lighteval_results/results") / model_name
        results_file = next(results_dir.glob("results_*.json"))
        with open(results_file) as f:
            results = json.load(f)
        accuracy = results["results"]["all"]["accuracy"]
        return {
            "model": model_name,
            "provider": provider,
            "accuracy": accuracy,
            "execution_time": execution_time,
            "status": "success"
        }
    except Exception as e:
        print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to parse results for {model_name} after {execution_time:.2f}s: {str(e)}")
        return {
            "model": model_name,
            "provider": provider,
            "accuracy": 0.0,
            "execution_time": execution_time,
            "status": "parse_error"
        }


def main():
    # Start global timer
    script_start_time = time.time()

    # Load environment variables
    load_dotenv()

    # Models to evaluate
    models = [
        "Qwen/QwQ-32B",
        "Qwen/Qwen2.5-72B-Instruct",
        "deepseek-ai/DeepSeek-V3-0324",
        "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
    ]

    # Get providers for each model
    model_providers = get_model_providers(models)

    print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting parallel evaluations")

    # Run evaluations in parallel using ProcessPoolExecutor
    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = [
            executor.submit(run_lighteval, model_name, providers[0])
            for model_name, providers in model_providers
            if providers  # Only run models that have at least one available provider
        ]
        results = [future.result() for future in concurrent.futures.as_completed(futures)]

    # Calculate total script execution time
    total_time = time.time() - script_start_time
    print(f"[{datetime.now().strftime('%H:%M:%S')}] All evaluations completed in {total_time:.2f}s")

    # Print results (as_completed yields them in completion order, not submission order)
    print("\nResults:")
    print("-" * 80)
    for result in results:
        print(f"Model: {result['model']}")
        print(f"Provider: {result['provider']}")
        print(f"Accuracy: {result['accuracy']:.2f}")
        print(f"Execution time: {result['execution_time']:.2f}s")
        print("-" * 80)


if __name__ == "__main__":
    main()
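
# Example invocation (assuming the working directory is the backend root, so that
# `tools.lighteval` is importable and the relative `data/lighteval_results` path resolves):
#
#   python tests/run_parallel_lighteval.py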