"""SemScore metric""" |
|
|
|
import evaluate |
|
import datasets |
|
import torch |
|
from transformers import AutoTokenizer, AutoModel |
|
from tqdm import tqdm |
|
|
|
_CITATION = """\
@misc{semscore,
    title={SemScore: Automated Evaluation of Instruction-Tuned LLMs based on Semantic Textual Similarity},
    author={Ansar Aynetdinov and Alan Akbik},
    year={2024},
    eprint={2401.17072},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2401.17072},
}
"""

_DESCRIPTION = """\
SemScore measures semantic textual similarity between candidate and reference texts.
"""

_KWARGS_DESCRIPTION = """
Computes SemScore, the semantic textual similarity between predictions and references,
as the cosine similarity between their sentence embeddings.
Args:
    predictions (list of str): list of predictions (instruction completions) to score. Each prediction
        should be a string.
    references (list of str): list of references (target completions). Each reference should be a string.
    batch_size (int): batch size used to embed predictions and references. Defaults to 32.
    device (str): device to run the embedding model on, e.g. "cpu" or "cuda". Defaults to "cpu".
Returns:
    semscore: aggregated system-level SemScore.
    similarities: cosine similarities between individual prediction-reference pairs.
Examples:
    >>> predictions = ['This is an example sentence', 'Each sentence is considered']
    >>> references = ['This is an example sentence', 'Each sentence is considered']
    >>> semscore = evaluate.load("semscore")
    >>> results = semscore.compute(predictions=predictions, references=references)
    >>> print(results['semscore'])
    100.0
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class SemScore(evaluate.Metric):
    """SemScore metric: semantic textual similarity between predictions and references."""

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                'predictions': datasets.Value('string'),
                'references': datasets.Value('string'),
            }),
            reference_urls=["https://arxiv.org/abs/2401.17072"],
        )

    def _download_and_prepare(self, dl_manager):
        """Download and prepare the embedding model used to compute the scores."""
        if self.config_name == "default":
            checkpoint = "sentence-transformers/all-mpnet-base-v2"
        else:
            checkpoint = self.config_name

        self.model = AutoModel.from_pretrained(checkpoint)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
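
    # Note: any Hugging Face model id accepted by AutoModel/AutoTokenizer should
    # work as a config name; an illustrative (untested) example:
    #   evaluate.load("semscore", config_name="sentence-transformers/all-MiniLM-L6-v2")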

    @staticmethod
    def _mean_pooling(model_output, attention_mask):
        """Mean pooling over token embeddings, using the attention mask so that
        padding tokens do not contribute to the average."""
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
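
    # Shape sketch for _mean_pooling (illustrative values, assuming batch size 2,
    # sequence length 3, hidden size 768):
    #   token_embeddings:    (2, 3, 768)  last_hidden_state of the encoder
    #   input_mask_expanded: (2, 3, 768)  1.0 at real tokens, 0.0 at padding
    # The masked sum divided by the clamped token count yields one (2, 768)
    # sentence embedding per input; the clamp guards against division by zero.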

    def _compute(
        self,
        predictions,
        references,
        batch_size=32,
        device=None,
    ):
        """Returns the scores."""
        assert len(predictions) == len(references), "predictions and references should have the same length."
        if device is not None:
            if "cuda" in device:
                assert torch.cuda.is_available(), "CUDA device requested but CUDA is not available."
            self.model.to(device)
        else:
            device = "cpu"

        # Embed references and predictions batch by batch.
        pooled_refs, pooled_preds = [], []
        with torch.inference_mode():
            for i in tqdm(range(0, len(references), batch_size), desc="Processing batches"):
                batch_refs = references[i : i + batch_size]
                batch_preds = predictions[i : i + batch_size]
                encoded_refs = self.tokenizer(batch_refs, padding=True, truncation=True, return_tensors='pt')
                encoded_preds = self.tokenizer(batch_preds, padding=True, truncation=True, return_tensors='pt')
                model_output_refs = self.model(**encoded_refs.to(device))
                model_output_preds = self.model(**encoded_preds.to(device))
                batch_pooled_refs = self._mean_pooling(model_output_refs, encoded_refs['attention_mask'])
                batch_pooled_preds = self._mean_pooling(model_output_preds, encoded_preds['attention_mask'])
                pooled_refs.append(batch_pooled_refs)
                pooled_preds.append(batch_pooled_preds)
        pooled_refs, pooled_preds = torch.cat(pooled_refs), torch.cat(pooled_preds)

        # Pair-wise cosine similarities, scaled by 100; the system-level
        # SemScore is their mean.
        similarities = torch.nn.functional.cosine_similarity(pooled_refs, pooled_preds)
        similarities = similarities * 100
        semscore = torch.mean(similarities)

        return {
            "semscore": round(semscore.item(), 2),
            "similarities": similarities.tolist(),
        }
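

# A minimal usage sketch (assumes this module is available to `evaluate.load`,
# e.g. saved as `semscore/semscore.py` locally or pushed to the Hub; the inputs
# below are illustrative):
if __name__ == "__main__":
    metric = evaluate.load("semscore")
    predictions = ["The cat sat on the mat.", "Paris is the capital of France."]
    references = ["A cat is sitting on a mat.", "France's capital is Paris."]
    results = metric.compute(predictions=predictions, references=references)
    print(results["semscore"])      # system-level score, scaled cosine similarity
    print(results["similarities"])  # per-pair cosine similarities * 100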