# %%
try:
    from ipytorch import logging
except Exception:
    import logging

from typing import Any, Optional, Protocol, Iterable, Callable

from tqdm.auto import tqdm
from evaluate.evaluation_suite import EvaluationSuite

# %%
# %cd ../tlem
# %load_ext ipytorch
# %ls

from utils import (
    NUMERIC_IN_ZH,
    extract_choice_ans,
    extract_numeric,
    get_answer,
    is_equiv,
)

from dataclasses import dataclass, field
from datasets import load_dataset, Dataset
from functools import cached_property

TextGenerationPipeline = Callable[[Iterable[str]], list[str]]

from evaluate import load


def fake_pipeline(prompts: Iterable[str]) -> list[str]:
    # Stand-in pipeline: echoes each prompt back unchanged, useful for dry runs.
    return [prompt for prompt in tqdm(prompts)]


@dataclass
class Task:
    dataset_name: str | tuple[str, str] = ("gsm8k", "main")
    split: str = "test"
    # metrics: list[str] = field(default_factory=list)
    metric_name: str | tuple[str, str] = ("sustech/tlem", "gsm8k")
    input_column: str = "question"
    label_column: str = "answer"
    prompt: Optional[Callable | str] = None

    @cached_property
    def samples(self):
        return self.dataset[self.input_column]

    @cached_property
    def dataset(self):
        ds = load_dataset(
            # Wrap a plain string name in a tuple so * unpacking stays a single argument.
            *self.dataset_name
            if isinstance(self.dataset_name, tuple)
            else (self.dataset_name,),
            split=self.split,
        )
        if self.prompt is not None:
            ds = ds.map(
                lambda example: {
                    self.input_column: self.prompt.format(
                        input_column=example[self.input_column]
                    )
                }
                if isinstance(self.prompt, str)
                else self.prompt(example),
            )
        return ds

    @cached_property
    def metric(self):
        metric = (
            load(self.metric_name)
            if isinstance(self.metric_name, str)
            else load(*self.metric_name)
        )
        return metric

    def run(self, pipeline: TextGenerationPipeline = fake_pipeline):
        outputs = pipeline(self.samples)
        return self.metric.compute(
            responses=outputs, references=self.dataset[self.label_column]
        )
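

# %%
# A minimal usage sketch (not called at import time): it assumes network access to
# the HF Hub for both the gsm8k dataset and the "sustech/tlem" metric. fake_pipeline
# just echoes the prompts back, so this exercises the plumbing rather than measuring
# real accuracy. The helper name is illustrative, not part of the original module.
def _demo_task_run():
    demo = Task(
        dataset_name=("gsm8k", "main"),
        metric_name=("sustech/tlem", "gsm8k"),
    )
    return demo.run(fake_pipeline)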


class Metrics:
    @staticmethod
    def gsm8k(responses: list[str], answers: list[str | int]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response)
            gold = extract_numeric(answer) if isinstance(answer, str) else str(answer)
            scores.append(1.0 * (pred == gold))
        return scores

    @staticmethod
    def MATH(responses: list[str], answers: list[str]):
        scores = []
        for response, answer in zip(responses, answers):
            indices = [pos for pos, char in enumerate(response) if char == "$"]
            if len(indices) <= 2:
                scores.append(0)
                continue
            result = response[indices[-2] + 1 : indices[-1]]
            gold = get_answer(answer)
            scores.append(1.0 * is_equiv(result, gold))
        return scores

    @staticmethod
    def math23k(responses: list[str], answers: list[str]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
            gold = extract_numeric(answer, pattern=NUMERIC_IN_ZH)
            scores.append(1.0 * (pred == gold))
        return scores

    @staticmethod
    def gsm8k_zh(responses: list[str], answers: list[str]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
            gold = extract_numeric(answer)
            scores.append(1.0 * (pred == gold))
        return scores

    @staticmethod
    def svamp(responses: list[str], answers: list[float]):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_numeric(response, pattern=NUMERIC_IN_ZH)
            gold = answer
            scores.append(1.0 * (float(pred) == gold))
        return scores

    @staticmethod
    def mmlu(responses, answers):
        scores = []
        for response, answer in zip(responses, answers):
            pred = extract_choice_ans(response)
            gold = answer.lower()
            scores.append(1.0 * (pred == gold))
        return scores
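

# %%
# Illustrative check of the gsm8k scorer (assumes utils.extract_numeric returns the
# last number in a string, which matches how gsm8k answers end in "#### <number>").
# Expected under that assumption: [1.0, 0.0]. The helper name is illustrative only.
def _demo_gsm8k_scores() -> list[float]:
    responses = ["Adding them up, the answer is 42", "I think it is 7"]
    answers = ["#### 42", "#### 8"]
    return Metrics.gsm8k(responses, answers)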


import evaluate
import numpy as np
import datasets

# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""

_DESCRIPTION = """\
Exact-match style accuracy metrics for reasoning benchmarks (gsm8k, MATH, math23k,
gsm8k_zh, svamp, mmlu). The config name selects which scorer is applied to each
response/reference pair.
"""

_KWARGS_DESCRIPTION = """
Scores model responses against reference answers for the selected task.
Args:
    responses: list of model outputs, one per example.
    references: list of gold answers (strings, or floats for svamp).
Returns:
    accuracy: mean of the per-example scores.
    scores: list of per-example scores (1.0 for a match, 0.0 otherwise).
Examples:
    >>> metric = evaluate.load("sustech/tlem", "gsm8k")
    >>> metric.compute(responses=["answer is 2", "1+2"], references=["2", "3"])
"""

# TODO: Define external resources urls if needed
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"


class ReasoningMetric(evaluate.Metric):
    """Accuracy metric for reasoning tasks; the config name picks the scorer from Metrics."""

    def _info(self):
        features = datasets.Features(
            {
                "responses": datasets.Value("string"),
                "references": datasets.Value("string"),
            }
        )
        if self.config_name == "svamp":
            features = datasets.Features(
                {
                    "responses": datasets.Value("string"),
                    "references": datasets.Value("float"),
                }
            )

        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.EvaluationModuleInfo(
            # This is the description that will appear on the modules page.
            # module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=features,
            # Homepage of the module for documentation
            homepage="http://module.homepage",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def _compute(self, responses, references, verbose=False):
        scores = getattr(Metrics, self.config_name)(responses, references)
        acc = np.asarray(scores).mean()
        results = {
            "accuracy": acc,
            "scores": scores,
        }
        if verbose:
            results["references"] = references
            results["answers"] = responses
            # results["scores"] = scores
        return results
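

# %%
# How _compute dispatches: the config name given to evaluate.load selects the scorer
# on Metrics by attribute name ("gsm8k" -> Metrics.gsm8k, "MATH" -> Metrics.MATH, ...).
# Sketch only; in normal use this happens inside metric.compute(responses=..., references=...).
def _demo_dispatch(config_name: str = "gsm8k") -> dict:
    scorer = getattr(Metrics, config_name)
    scores = scorer(["the answer is 3"], ["#### 3"])
    return {"accuracy": float(np.asarray(scores).mean()), "scores": scores}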


class Suite(EvaluationSuite):
    def run(
        self, model_or_pipeline: Any, prompt: str = "{instruction}"
    ) -> dict[str, dict]:
        self.assert_suite_nonempty()

        results_all = {}
        for task in tqdm(self.suite, desc="Running tasks"):
            # Task defines no explicit name field, so fall back to its dataset name as the key.
            task_name = getattr(task, "name", str(task.dataset_name))
            results = task.run(model_or_pipeline)
            results_all[task_name] = results
        return results_all

    def __init__(self, name):
        super().__init__(name)
        self.suite = [
            Task(
                dataset_name=("gsm8k", "main"),
                metric_name=("sustech/tlem", "gsm8k"),
                input_column="question",
                label_column="answer",
            )
            # TASK_REGISTRY["gsm8k"],
            # TASK_REGISTRY["competition_math"],
        ]
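

# %%
# Hedged sketch of extending a suite with a second task before running it. The
# "competition_math" dataset name and its "problem"/"solution" columns are assumptions
# about the Hub dataset, and "MATH" must match a scorer defined on Metrics above.
# math_suite = Suite("tlem-demo")
# math_suite.suite.append(
#     Task(
#         dataset_name="competition_math",
#         metric_name=("sustech/tlem", "MATH"),
#         input_column="problem",
#         label_column="solution",
#     )
# )
# math_suite.run(fake_pipeline)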

# %%
if __name__ == "__main__":
    # metric = load("sustech/tlem", "gsm8k")
    # output = metric.compute(responses=["answer is 2", "1+2"], references=["2", "3"])
    # logging.info(output)
    suite = EvaluationSuite.load("sustech/tlem")
    suite.run(fake_pipeline)

# %%