"""SemScore metric""" |
|
|
|
import evaluate |
|
import datasets |
|
import torch |
|
from transformers import AutoTokenizer, AutoModel |
|
from tqdm import tqdm |
|
|
|
_CITATION = """\
@misc{semscore,
    title={SemScore: Automated Evaluation of Instruction-Tuned LLMs based on Semantic Textual Similarity},
    author={Ansar Aynetdinov and Alan Akbik},
    year={2024},
    eprint={2401.17072},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2401.17072},
}
"""

_DESCRIPTION = """\
SemScore measures semantic textual similarity between candidate and reference texts.
"""

_KWARGS_DESCRIPTION = """
Computes SemScore, the semantic textual similarity between predictions and references,
as the cosine similarity between their sentence embeddings.
Args:
    predictions (list of str): list of predictions (instruction completions) to score. Each prediction
        should be a string.
    references (list of str): list of references (target completions). Each reference should be a string.
    batch_size (int): batch size used to embed predictions and references. Defaults to 32.
    device (str): device to run the embedding model on, e.g. "cpu" or "cuda". Defaults to "cpu".
Returns:
    semscore: aggregated system-level SemScore.
    similarities: cosine similarities between individual prediction-reference pairs.
Examples:
    >>> predictions = ['This is an example sentence', 'Each sentence is considered']
    >>> references = ['This is an example sentence', 'Each sentence is considered']
    >>> semscore = evaluate.load("semscore")
    >>> results = semscore.compute(predictions=predictions, references=references)
    >>> print(results['semscore'])
    100.0
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class SemScore(evaluate.Metric):
    """SemScore metric: semantic textual similarity between predictions and references."""

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                'predictions': datasets.Value('string'),
                'references': datasets.Value('string'),
            }),
            reference_urls=["https://arxiv.org/abs/2401.17072"],
        )

    def _download_and_prepare(self, dl_manager):
        """Download and prepare the embedding model used to compute the scores."""
        if self.config_name == "default":
            checkpoint = "sentence-transformers/all-mpnet-base-v2"
        else:
            checkpoint = self.config_name

        self.model = AutoModel.from_pretrained(checkpoint)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
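
    # Note: any Hugging Face model id accepted by AutoModel/AutoTokenizer should
    # work as a config name; an illustrative (untested) example:
    #   evaluate.load("semscore", config_name="sentence-transformers/all-MiniLM-L6-v2")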

    @staticmethod
    def _mean_pooling(model_output, attention_mask):
        """Mean pooling over token embeddings, using the attention mask so that
        padding tokens do not contribute to the average."""
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
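
    # Shape sketch for _mean_pooling (illustrative values, assuming batch size 2,
    # sequence length 3, hidden size 768):
    #   token_embeddings:    (2, 3, 768)  last_hidden_state of the encoder
    #   input_mask_expanded: (2, 3, 768)  1.0 at real tokens, 0.0 at padding
    # The masked sum divided by the clamped token count yields one (2, 768)
    # sentence embedding per input; the clamp guards against division by zero.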

    def _compute(
        self,
        predictions,
        references,
        batch_size=32,
        device=None,
    ):
        """Returns the scores."""
        assert len(predictions) == len(references), "predictions and references should have the same length."
        if device is not None:
            if "cuda" in device:
                assert torch.cuda.is_available(), "CUDA device requested but CUDA is not available."
            self.model.to(device)
        else:
            device = "cpu"

        # Embed references and predictions batch by batch.
        pooled_refs, pooled_preds = [], []
        with torch.inference_mode():
            for i in tqdm(range(0, len(references), batch_size), desc="Processing batches"):
                batch_refs = references[i : i + batch_size]
                batch_preds = predictions[i : i + batch_size]
                encoded_refs = self.tokenizer(batch_refs, padding=True, truncation=True, return_tensors='pt')
                encoded_preds = self.tokenizer(batch_preds, padding=True, truncation=True, return_tensors='pt')
                model_output_refs = self.model(**encoded_refs.to(device))
                model_output_preds = self.model(**encoded_preds.to(device))
                batch_pooled_refs = self._mean_pooling(model_output_refs, encoded_refs['attention_mask'])
                batch_pooled_preds = self._mean_pooling(model_output_preds, encoded_preds['attention_mask'])
                pooled_refs.append(batch_pooled_refs)
                pooled_preds.append(batch_pooled_preds)
        pooled_refs, pooled_preds = torch.cat(pooled_refs), torch.cat(pooled_preds)

        # Pair-wise cosine similarities, scaled by 100; the system-level
        # SemScore is their mean.
        similarities = torch.nn.functional.cosine_similarity(pooled_refs, pooled_preds)
        similarities = similarities * 100
        semscore = torch.mean(similarities)

        return {
            "semscore": round(semscore.item(), 2),
            "similarities": similarities.tolist(),
        }
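

# A minimal usage sketch (assumes this module is available to `evaluate.load`,
# e.g. saved as `semscore/semscore.py` locally or pushed to the Hub; the inputs
# below are illustrative):
if __name__ == "__main__":
    metric = evaluate.load("semscore")
    predictions = ["The cat sat on the mat.", "Paris is the capital of France."]
    references = ["A cat is sitting on a mat.", "France's capital is Paris."]
    results = metric.compute(predictions=predictions, references=references)
    print(results["semscore"])      # system-level score, scaled cosine similarity
    print(results["similarities"])  # per-pair cosine similarities * 100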