import json
import logging

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.model_config import InferenceEndpointModelConfig
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters

from src.envs import RESULTS_REPO
from src.backend.manage_requests import EvalRequest
from src.logging import setup_logger

logging.getLogger("openai").setLevel(logging.WARNING)

logger = setup_logger(__name__)


def run_evaluation(
    eval_request: EvalRequest,
    task_names: str,
    batch_size: int,
    local_dir: str,
    accelerator: str,
    region: str,
    vendor: str,
    instance_size: str,
    instance_type: str,
    limit=None,
):
"""Runs one evaluation for the current evaluation request file using lighteval, then pushes the results to the hub. | |
Args: | |
eval_request (EvalRequest): Input evaluation request file representation | |
task_names (list): Tasks to launch | |
batch_size (int): Selected batch size | |
accelerator (str): Inference endpoint parameter for running the evaluation | |
region (str): Inference endpoint parameter for running the evaluation | |
vendor (str): Inference endpoint parameter for running the evaluation | |
instance_size (str): Inference endpoint parameter for running the evaluation | |
instance_type (str): Inference endpoint parameter for running the evaluation | |
local_dir (str): Where to save the results locally | |
no_cache (bool, optional): Whether to use a cache or not. | |
limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging | |
""" | |
    if limit:
        logger.info(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )
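
    # The tracker saves results and per-sample details locally, then pushes them
    # to the Hub results repository configured in src.envs (RESULTS_REPO).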
    evaluation_tracker = EvaluationTracker(
        output_dir=local_dir,  # local directory where results are written before being pushed
        save_details=True,
        push_to_hub=True,
        push_to_tensorboard=False,
        hub_results_org=RESULTS_REPO,
        public=False,
    )
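
    # Pipeline parameters: how the evaluation is launched (accelerate backend,
    # batch size, optional sample cap, and the file defining any custom tasks).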
    pipeline_params = PipelineParameters(
        launcher_type=ParallelismManager.ACCELERATE,
        override_batch_size=batch_size,
        max_samples=limit,
        use_chat_template=False,
        system_prompt=None,
        custom_tasks_directory="custom_tasks.py",  # if using a custom task
    )
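
    # Model configuration: the model is served on a Hugging Face inference endpoint
    # built from the request (repository, revision, precision) and the hardware arguments.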
    model_config = InferenceEndpointModelConfig(
        # Endpoint parameters
        name=eval_request.model.replace(".", "-").lower(),
        repository=eval_request.model,
        accelerator=accelerator,
        vendor=vendor,
        region=region,
        instance_size=instance_size,
        instance_type=instance_type,
        should_reuse_existing=False,
        model_dtype=eval_request.precision,
        revision=eval_request.revision,
    )

    pipeline = Pipeline(
        tasks=task_names,
        pipeline_parameters=pipeline_params,
        evaluation_tracker=evaluation_tracker,
        model_config=model_config,
    )

    try:
        pipeline.evaluate()
        pipeline.show_results()
        pipeline.save_and_push_results()
        results = pipeline.get_results()

        dumped = json.dumps(results, indent=2)
        logger.info(dumped)
    except Exception:
        # If the evaluation failed, tear down the inference endpoint before propagating
        # the error; otherwise `results` would be undefined below.
        pipeline.model.cleanup()
        raise

    return results
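

# Example invocation - a minimal sketch only. The request fields, task string, and
# endpoint hardware values below are illustrative assumptions, not values taken from
# this repository; adapt them to your own EvalRequest schema and endpoint quota.
if __name__ == "__main__":
    sample_request = EvalRequest(  # hypothetical field values for demonstration
        model="openai-community/gpt2",
        revision="main",
        precision="float16",
        status="RUNNING",
        json_filepath="",
    )
    run_evaluation(
        eval_request=sample_request,
        task_names="leaderboard|arc:challenge|0|0",  # lighteval task spec: suite|task|num_fewshot|truncate_fewshot
        batch_size=1,
        local_dir="./results",
        accelerator="gpu",
        region="us-east-1",
        vendor="aws",
        instance_size="x1",
        instance_type="nvidia-a10g",
        limit=10,  # small sample cap: debugging only
    )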