from typing import List, Optional, Union

import datasets
import evaluate
import lm_eval
from dmx.compressor.dmx import DmxModel, config_rules

_DESCRIPTION = """
Evaluation function using lm-eval with d-Matrix integration.
This function evaluates language models on one or more lm-eval tasks and can
optionally apply d-Matrix compression transformations to the model first.
"""

_KWARGS_DESCRIPTION = """
Args:
    model (str): The name or path of the model to evaluate.
    tasks (Union[str, List[str]]): The task or list of tasks to evaluate on.
    dmx_config (Optional[str]): Name of the d-Matrix configuration rule set to apply, defaults to None.
    num_fewshot (Optional[int]): Number of examples in the few-shot context, defaults to None.
    batch_size (Optional[Union[int, str]]): Batch size for the model, defaults to None.
    max_batch_size (Optional[int]): Maximum batch size to try with automatic batch size detection, defaults to None.
    limit (Optional[Union[int, float]]): Limit on the number of examples per task, defaults to None.
    revision (str): Model revision to use, defaults to 'main'.
    trust_remote_code (bool): Whether to trust remote code, defaults to False.
    log_samples (bool): If True, logs all model outputs and documents, defaults to True.
    verbosity (str): Logging verbosity level, defaults to 'INFO'.
    **kwargs: Additional keyword arguments passed to `lm_eval.evaluate`.

Returns:
    dict: A dictionary containing the evaluation results.
"""

class DmxMetric(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation="",
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "references": datasets.Value("string"),
                }
            ),
            reference_urls=["https://github.com/EleutherAI/lm-evaluation-harness"],
        )

    def _compute(
        self,
        model: str,
        tasks: Union[str, List[str]],
        dmx_config: Optional[str] = None,
        num_fewshot: Optional[int] = None,
        batch_size: Optional[Union[int, str]] = None,
        max_batch_size: Optional[int] = None,
        limit: Optional[Union[int, float]] = None,
        revision: str = "main",
        trust_remote_code: bool = False,
        log_samples: bool = True,
        verbosity: str = "INFO",
        **kwargs,
    ):
        """
        Evaluate a model on one or more tasks using lm-eval, with optional d-Matrix integration.
        """
        # Load the model through lm-eval's Hugging Face wrapper.
        model_args = f"pretrained={model},revision={revision},trust_remote_code={str(trust_remote_code)}"
        lm = lm_eval.api.registry.get_model("hf").create_from_arg_string(
            model_args,
            {
                "batch_size": batch_size,
                "max_batch_size": max_batch_size,
            },
        )

        # If requested, wrap the underlying torch model with d-Matrix and apply the
        # named rule set from config_rules (attribute lookup rather than eval()).
        if dmx_config:
            lm._model = DmxModel.from_torch(lm._model)
            lm._model.transform(lm._model.dmx_config, *getattr(config_rules, dmx_config))

        # Resolve task names into lm-eval task objects and apply the few-shot setting.
        task_dict = lm_eval.tasks.get_task_dict(
            tasks if isinstance(tasks, list) else [tasks]
        )
        if num_fewshot is not None:
            for task in task_dict.values():
                task.set_config(key="num_fewshot", value=num_fewshot)

        # Run the evaluation and return only the per-task results.
        eval_params = {
            "lm": lm,
            "task_dict": task_dict,
            "limit": limit,
            "log_samples": log_samples,
            "verbosity": verbosity,
            **kwargs,
        }
        results = lm_eval.evaluate(**eval_params)
        return results.get("results", {})
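

# A minimal usage sketch. The "gpt2" checkpoint, the "hellaswag" task, and the
# "BASELINE" rule-set name are illustrative assumptions, not part of this module;
# calling the private _compute directly bypasses the metric's reference bookkeeping
# and is for demonstration only.
if __name__ == "__main__":
    metric = DmxMetric()
    results = metric._compute(
        model="gpt2",           # any Hugging Face causal LM checkpoint
        tasks="hellaswag",      # a single task name or a list of task names
        dmx_config="BASELINE",  # hypothetical rule-set name in config_rules; omit to skip d-Matrix transforms
        num_fewshot=0,
        batch_size=8,
        limit=10,               # evaluate only a few examples for a quick check
    )
    print(results)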