# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""SemScore metric"""
import evaluate
import datasets
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

_CITATION = """\
@misc{semscore,
      title={SemScore: Automated Evaluation of Instruction-Tuned LLMs based on Semantic Textual Similarity},
      author={Ansar Aynetdinov and Alan Akbik},
      year={2024},
      eprint={2401.17072},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2401.17072},
}
"""

_DESCRIPTION = """\
SemScore measures semantic textual similarity between candidate and reference texts.
"""

_KWARGS_DESCRIPTION = """
Computes SemScore: the cosine similarity between sentence embeddings of predictions and references.
Args:
    predictions (list of str): list of predictions (instruction completions) to score. Each prediction
        should be a string.
    references (list of str): list of references (target completions). Each reference should be a string.
    batch_size (int): number of prediction-reference pairs to encode per forward pass. Defaults to 32.
    device (str): device to run the encoder on, e.g. "cpu" or "cuda". Defaults to "cpu".
Returns:
    semscore: aggregated system-level SemScore (mean pairwise cosine similarity, multiplied by 100).
    similarities: cosine similarities between individual prediction-reference pairs.
Examples:
    >>> predictions = ['This is an example sentence', 'Each sentence is considered']
    >>> references = ['This is an example sentence', 'Each sentence is considered']
    >>> semscore = evaluate.load("semscore")
    >>> results = semscore.compute(predictions=predictions, references=references)
    >>> print(results['semscore'])
    100.0
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class SemScore(evaluate.Metric):
    """SemScore metric: embedding-based semantic similarity between predictions and references."""

    def _info(self):
        # Specifies the evaluate.MetricInfo object for this module
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features({
                'predictions': datasets.Value('string'),
                'references': datasets.Value('string'),
            }),
            # Additional link to the reference
            reference_urls=["https://arxiv.org/abs/2401.17072"]
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        if self.config_name == "default":
            checkpoint = "sentence-transformers/all-mpnet-base-v2"
        else:
            checkpoint = self.config_name
        # Load model and tokenizer from the Hugging Face Hub
        self.model = AutoModel.from_pretrained(checkpoint)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
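
    # Note (illustrative addition, not from the original source): a different
    # sentence-transformers checkpoint can be selected when loading the metric
    # via the config name, e.g.
    #     evaluate.load("semscore", config_name="sentence-transformers/all-MiniLM-L6-v2")
    # Any encoder checkpoint resolvable by AutoModel/AutoTokenizer is assumed to work.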

    @staticmethod
    def _mean_pooling(model_output, attention_mask):
        """Mean pooling over all tokens - take attention mask into account for correct averaging"""
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
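
    # Illustrative note (added, not from the paper): for a batch of two padded
    # sequences with hidden size H, token_embeddings has shape (2, seq_len, H)
    # and attention_mask has shape (2, seq_len). Multiplying by the expanded
    # mask zeroes out padding positions, so each sentence embedding is the mean
    # of its real token embeddings only; the clamp guards against division by
    # zero for a hypothetical all-padding row.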

    def _compute(
        self,
        predictions,
        references,
        batch_size=32,
        device=None,
    ):
        """Returns the scores"""
        assert len(predictions) == len(references), "predictions and references should have the same length."
        if device is not None:
            if "cuda" in device:
                assert torch.cuda.is_available(), "CUDA device requested but CUDA is not available."
            self.model.to(device)
        else:
            device = "cpu"
        pooled_refs, pooled_preds = [], []
        with torch.inference_mode():
            for i in tqdm(range(0, len(references), batch_size), desc="Processing batches"):
                batch_refs = references[i : i + batch_size]
                batch_preds = predictions[i : i + batch_size]
                encoded_refs = self.tokenizer(batch_refs, padding=True, truncation=True, return_tensors='pt')
                encoded_preds = self.tokenizer(batch_preds, padding=True, truncation=True, return_tensors='pt')
                model_output_refs = self.model(**encoded_refs.to(device))
                model_output_preds = self.model(**encoded_preds.to(device))
                batch_pooled_refs = self._mean_pooling(model_output_refs, encoded_refs['attention_mask'])
                batch_pooled_preds = self._mean_pooling(model_output_preds, encoded_preds['attention_mask'])
                pooled_refs.append(batch_pooled_refs)
                pooled_preds.append(batch_pooled_preds)
        pooled_refs, pooled_preds = torch.cat(pooled_refs), torch.cat(pooled_preds)
        similarities = torch.nn.functional.cosine_similarity(pooled_refs, pooled_preds)
        # Scale cosine similarities to percentages; the system-level score is their mean
        similarities = similarities * 100
        semscore = torch.mean(similarities)
        return {
            "semscore": round(semscore.item(), 2),
            "similarities": similarities.tolist()
        }
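

if __name__ == "__main__":
    # Minimal smoke test (illustrative sketch, not part of the published metric).
    # Assumes `evaluate.load` can resolve this module by the name used in the
    # docstring example above; identical predictions and references should
    # yield a SemScore of 100.0.
    semscore = evaluate.load("semscore")
    results = semscore.compute(
        predictions=["This is an example sentence", "Each sentence is considered"],
        references=["This is an example sentence", "Each sentence is considered"],
    )
    print(results["semscore"])  # expected: 100.0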