import json
import logging

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.model_config import InferenceEndpointModelConfig
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters

from src.envs import RESULTS_REPO
from src.backend.manage_requests import EvalRequest
from src.logging import setup_logger

logging.getLogger("openai").setLevel(logging.WARNING)

logger = setup_logger(__name__)


def run_evaluation(
    eval_request: EvalRequest,
    task_names: str,
    batch_size: int,
    local_dir: str,
    accelerator: str,
    region: str,
    vendor: str,
    instance_size: str,
    instance_type: str,
    limit=None,
):
| """Runs one evaluation for the current evaluation request file using lighteval, then pushes the results to the hub. | |
| Args: | |
| eval_request (EvalRequest): Input evaluation request file representation | |
| task_names (list): Tasks to launch | |
| batch_size (int): Selected batch size | |
| accelerator (str): Inference endpoint parameter for running the evaluation | |
| region (str): Inference endpoint parameter for running the evaluation | |
| vendor (str): Inference endpoint parameter for running the evaluation | |
| instance_size (str): Inference endpoint parameter for running the evaluation | |
| instance_type (str): Inference endpoint parameter for running the evaluation | |
| local_dir (str): Where to save the results locally | |
| no_cache (bool, optional): Whether to use a cache or not. | |
| limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging | |
| """ | |
    if limit:
        logger.info(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    evaluation_tracker = EvaluationTracker(
        output_dir="./results",
        save_details=True,
        push_to_hub=True,
        push_to_tensorboard=False,
        hub_results_org=RESULTS_REPO,
        public=False,
    )

    pipeline_params = PipelineParameters(
        launcher_type=ParallelismManager.ACCELERATE,
        override_batch_size=batch_size,
        max_samples=limit,
        use_chat_template=False,
        system_prompt=None,
        custom_tasks_directory="custom_tasks.py",  # if using a custom task
    )

    model_config = InferenceEndpointModelConfig(
        # Endpoint parameters
        name=eval_request.model.replace(".", "-").lower(),
        repository=eval_request.model,
        accelerator=accelerator,
        vendor=vendor,
        region=region,
        instance_size=instance_size,
        instance_type=instance_type,
        should_reuse_existing=False,
        model_dtype=eval_request.precision,
        revision=eval_request.revision,
    )

    pipeline = Pipeline(
        tasks=task_names,
        pipeline_parameters=pipeline_params,
        evaluation_tracker=evaluation_tracker,
        model_config=model_config,
    )
    results = None
    try:
        pipeline.evaluate()
        pipeline.show_results()
        pipeline.save_and_push_results()

        results = pipeline.get_results()
        dumped = json.dumps(results, indent=2)
        logger.info(dumped)
    except Exception:  # if eval failed, we force a cleanup of the endpoint
        logger.exception("Evaluation failed, cleaning up the inference endpoint")
        pipeline.model.cleanup()

    return results
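

# Illustrative usage sketch, not part of the original module: it shows how
# run_evaluation might be driven locally. The task string, endpoint hardware
# values, and the SimpleNamespace stand-in (used instead of loading a real
# EvalRequest from the requests dataset) are assumptions for demonstration only;
# run_evaluation only reads .model, .precision and .revision from the request.
if __name__ == "__main__":
    from types import SimpleNamespace

    # Hypothetical pending request; the real backend builds an EvalRequest
    # from a request file in src.backend.manage_requests.
    dummy_request = SimpleNamespace(
        model="org/model-name",  # hypothetical model repository
        precision="float16",
        revision="main",
    )

    run_evaluation(
        eval_request=dummy_request,
        task_names="custom|mytask|0|0",  # assumed task string, defined in custom_tasks.py
        batch_size=1,
        local_dir="./eval-results",
        accelerator="gpu",               # placeholder Inference Endpoint settings
        region="us-east-1",
        vendor="aws",
        instance_size="x1",
        instance_type="nvidia-a10g",
        limit=10,                        # debugging only, see the docstring above
    )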