Spaces:

yourbench
/

demo

Running on CPU Upgrade

App Files Files Community

demo / backend /tests /run_parallel_lighteval.py

tfrere

update lighteval results

39acd70 4 months ago

raw

history blame

4.53 kB

	import tempfile
	import time
	import subprocess
	import os
	import json
	from pathlib import Path
	import concurrent.futures
	from dotenv import load_dotenv
	from datetime import datetime
	import yaml
	import argparse
	from typing import Dict, Any
	from tqdm import tqdm
	from tools.lighteval.get_model_providers import get_model_providers

	def run_lighteval(model_name: str, provider: str, timeout: float = 60.0) -> dict:
	    """Run a small LightEval evaluation of one model through one inference provider.

	    A temporary custom-task module exposing the ``yourbench`` task is written
	    to disk, the ``lighteval endpoint inference-providers`` CLI is invoked on it
	    (3 samples, results under ``data/lighteval_results``), and the accuracy is
	    read back from the newest ``results_*.json`` file produced for the model.

	    Args:
	        model_name: Model identifier passed to LightEval (e.g. ``"org/model"``).
	        provider: Inference provider name passed to LightEval.
	        timeout: Maximum number of seconds the LightEval process may run.

	    Returns:
	        A dict with the keys ``model``, ``provider``, ``accuracy``,
	        ``execution_time`` and ``status``. ``status`` is ``"success"``,
	        ``"timeout"`` (the process exceeded ``timeout``) or ``"parse_error"``
	        (no readable results file was found); ``accuracy`` is ``0.0`` unless
	        the status is ``"success"``.

	    Raises:
	        OSError: If the ``lighteval`` executable cannot be started.
	    """
	    start_time = time.time()
	    print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting evaluation with {provider} provider for {model_name}")

	    # Source of the throw-away module handed to LightEval via --custom-tasks.
	    # Built line by line so the generated file carries no stray indentation.
	    task_source = "\n".join([
	        "from lighteval_task.lighteval_task import create_yourbench_task",
	        "",
	        "# Create yourbench task",
	        'yourbench = create_yourbench_task("yourbench/yourbench_test", "single_shot_questions")',
	        "",
	        "# Define TASKS_TABLE needed by lighteval",
	        "TASKS_TABLE = [yourbench]",
	        "",
	    ])

	    # NamedTemporaryFile creates the file atomically; tempfile.mktemp only
	    # returns a name and is deprecated because another process can claim it.
	    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as temp_file:
	        temp_file.write(task_source)
	        temp_file_path = temp_file.name

	    # LightEval command
	    cmd_args = [
	        "lighteval",
	        "endpoint",
	        "inference-providers",
	        f"model={model_name},provider={provider}",
	        # Task spec uses plain pipes; escaped "\|" would reach the CLI as
	        # literal backslashes.
	        "custom|yourbench|0|0",
	        "--custom-tasks",
	        temp_file_path,
	        "--max-samples", "3",
	        "--output-dir", "data/lighteval_results",
	        "--no-push-to-hub",
	    ]

	    try:
	        # NOTE: the exit code is not checked; a failed run surfaces below as
	        # "parse_error" when no results file can be read.
	        subprocess.run(cmd_args, env=os.environ, timeout=timeout)
	    except subprocess.TimeoutExpired:
	        print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
	        return {
	            "model": model_name,
	            "provider": provider,
	            "accuracy": 0.0,
	            "execution_time": float(timeout),
	            "status": "timeout",
	        }
	    finally:
	        # Remove the task file on every path (success, timeout, launch error).
	        try:
	            os.unlink(temp_file_path)
	        except FileNotFoundError:
	            pass

	    # Calculate execution time
	    execution_time = time.time() - start_time
	    print(f"[{datetime.now().strftime('%H:%M:%S')}] Finished evaluation for {model_name} in {execution_time:.2f}s")

	    try:
	        # A "/" in the model name maps onto nested result directories.
	        results_dir = Path("data/lighteval_results/results") / model_name
	        # Earlier runs may have left files behind: take the most recent one.
	        results_file = max(
	            results_dir.glob("results_*.json"),
	            key=lambda path: path.stat().st_mtime,
	        )
	        with open(results_file) as f:
	            results = json.load(f)
	        accuracy = results["results"]["all"]["accuracy"]
	    except (OSError, ValueError, KeyError, TypeError) as e:
	        # ValueError covers both "no results file" (max of an empty glob)
	        # and malformed JSON.
	        print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to parse results for {model_name} after {execution_time:.2f}s: {str(e)}")
	        return {
	            "model": model_name,
	            "provider": provider,
	            "accuracy": 0.0,
	            "execution_time": execution_time,
	            "status": "parse_error",
	        }

	    return {
	        "model": model_name,
	        "provider": provider,
	        "accuracy": accuracy,
	        "execution_time": execution_time,
	        "status": "success",
	    }

	def main():
	    """Evaluate a fixed set of models in parallel and print a per-model summary."""
	    # Reference point for the total wall-clock time of the script
	    script_start_time = time.time()

	    # Load environment variables
	    load_dotenv()

	    # Models to evaluate
	    models = [
	        "Qwen/QwQ-32B",
	        "Qwen/Qwen2.5-72B-Instruct",
	        "deepseek-ai/DeepSeek-V3-0324",
	        "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
	    ]

	    # Look up the providers for each model
	    model_providers = get_model_providers(models)

	    print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting parallel evaluations")

	    # One worker process per submitted evaluation, managed by the pool
	    with concurrent.futures.ProcessPoolExecutor() as pool:
	        pending = []
	        for name, available in model_providers:
	            # Models with no provider are skipped
	            if not available:
	                continue
	            # Only the first listed provider is used
	            pending.append(pool.submit(run_lighteval, name, available[0]))

	        # Collect outcomes as each evaluation finishes
	        results = []
	        for finished in concurrent.futures.as_completed(pending):
	            results.append(finished.result())

	    # Total script execution time
	    total_time = time.time() - script_start_time
	    print(f"[{datetime.now().strftime('%H:%M:%S')}] All evaluations completed in {total_time:.2f}s")

	    # Print results in completion order, one separator after each entry
	    separator = "-" * 80
	    print("\nResults:")
	    print(separator)
	    for result in results:
	        print(f"Model: {result['model']}")
	        print(f"Provider: {result['provider']}")
	        print(f"Accuracy: {result['accuracy']:.2f}")
	        print(f"Execution time: {result['execution_time']:.2f}s")
	        print(separator)

	# Run the evaluations only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()