# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""SemScore metric."""

import datasets
import evaluate
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

_CITATION = """\
@misc{semscore,
    title={SemScore: Automated Evaluation of Instruction-Tuned LLMs based on Semantic Textual Similarity},
    author={Ansar Aynetdinov and Alan Akbik},
    year={2024},
    eprint={2401.17072},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2401.17072},
}
"""

_DESCRIPTION = """\
SemScore measures semantic textual similarity between candidate and reference texts.
Both texts are embedded with a sentence-transformer model and scored by the cosine
similarity of their embeddings.
"""

_KWARGS_DESCRIPTION = """
Computes SemScore, the semantic textual similarity between predictions and references.

Args:
    predictions (list of str): list of predictions (instruction completions) to score.
        Each prediction should be a string.
    references (list of str): list of references (target completions).
        Each reference should be a string.
    batch_size (int): batch size for embedding the texts. Defaults to 32.
    device (str, optional): device to run the model on, e.g. "cpu" or "cuda:0".
        Defaults to "cpu".

Returns:
    semscore (float): aggregated system-level SemScore, i.e. the mean cosine
        similarity scaled by 100.
    similarities (list of float): cosine similarities between individual
        prediction-reference pairs, scaled by 100.

Examples:
    >>> predictions = ['This is an example sentence', 'Each sentence is considered']
    >>> references = ['This is an example sentence', 'Each sentence is considered']
    >>> semscore = evaluate.load("semscore")
    >>> results = semscore.compute(predictions=predictions, references=references)
    >>> print(results['semscore'])
    100.0
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class SemScore(evaluate.Metric):
    """SemScore: semantic textual similarity between predictions and references."""

    def _info(self):
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference.
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            # Additional link to the reference paper.
            reference_urls=["https://arxiv.org/abs/2401.17072"],
        )

    def _download_and_prepare(self, dl_manager):
        """Download and set up the embedding model and tokenizer."""
        if self.config_name == "default":
            checkpoint = "sentence-transformers/all-mpnet-base-v2"
        else:
            checkpoint = self.config_name
        # Load model and tokenizer from the HuggingFace Hub.
        self.model = AutoModel.from_pretrained(checkpoint)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    @staticmethod
    def _mean_pooling(model_output, attention_mask):
        """Mean pooling over all tokens, taking the attention mask into account for correct averaging."""
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )

    def _compute(self, predictions, references, batch_size=32, device=None):
        """Returns the aggregated SemScore and the per-pair similarities."""
        assert len(predictions) == len(references), "predictions and references should have the same length."
        if device is None:
            device = "cpu"
        if "cuda" in device:
            assert torch.cuda.is_available(), "CUDA device requested but CUDA is not available."
        # Move the model explicitly so repeated calls with different devices stay consistent.
        self.model.to(device)

        pooled_refs, pooled_preds = [], []
        with torch.inference_mode():
            for i in tqdm(range(0, len(references), batch_size), desc="Processing batches"):
                batch_refs = references[i : i + batch_size]
                batch_preds = predictions[i : i + batch_size]
                encoded_refs = self.tokenizer(batch_refs, padding=True, truncation=True, return_tensors="pt")
                encoded_preds = self.tokenizer(batch_preds, padding=True, truncation=True, return_tensors="pt")
                model_output_refs = self.model(**encoded_refs.to(device))
                model_output_preds = self.model(**encoded_preds.to(device))
                batch_pooled_refs = self._mean_pooling(model_output_refs, encoded_refs["attention_mask"])
                batch_pooled_preds = self._mean_pooling(model_output_preds, encoded_preds["attention_mask"])
                pooled_refs.append(batch_pooled_refs)
                pooled_preds.append(batch_pooled_preds)
        pooled_refs, pooled_preds = torch.cat(pooled_refs), torch.cat(pooled_preds)
        # Cosine similarity per prediction-reference pair, scaled to [-100, 100].
        similarities = torch.nn.functional.cosine_similarity(pooled_refs, pooled_preds) * 100
        semscore = torch.mean(similarities)
        return {
            "semscore": round(semscore.item(), 2),
            "similarities": similarities.tolist(),
        }
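

# A minimal smoke test for running this script directly. This is a sketch, not part of
# the metric itself: it assumes the file is saved locally as `semscore.py` and that
# `evaluate.load` is given a path to that local script; the embedding model is
# downloaded from the HuggingFace Hub on first use.
if __name__ == "__main__":
    metric = evaluate.load("semscore.py")  # assumed local script path
    results = metric.compute(
        predictions=["This is an example sentence", "Each sentence is considered"],
        references=["This is an example sentence", "Each sentence is considered"],
    )
    print(results["semscore"])  # identical texts should score 100.0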