# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""SemScore metric."""

import datasets
import evaluate
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

_CITATION = """\
@misc{semscore,
    title={SemScore: Automated Evaluation of Instruction-Tuned LLMs based on Semantic Textual Similarity},
    author={Ansar Aynetdinov and Alan Akbik},
    year={2024},
    eprint={2401.17072},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2401.17072},
}
"""

_DESCRIPTION = """\
SemScore measures semantic textual similarity between candidate and reference texts.
Both texts are embedded with a sentence-transformer model and scored by the cosine
similarity of their embeddings.
"""

_KWARGS_DESCRIPTION = """
Computes SemScore, the semantic textual similarity between predictions and references.

Args:
    predictions (list of str): list of predictions (instruction completions) to score.
        Each prediction should be a string.
    references (list of str): list of references (target completions).
        Each reference should be a string.
    batch_size (int): batch size for embedding the texts. Defaults to 32.
    device (str, optional): device to run the model on, e.g. "cpu" or "cuda:0".
        Defaults to "cpu".

Returns:
    semscore (float): aggregated system-level SemScore, i.e. the mean cosine
        similarity scaled by 100.
    similarities (list of float): cosine similarities between individual
        prediction-reference pairs, scaled by 100.

Examples:
    >>> predictions = ['This is an example sentence', 'Each sentence is considered']
    >>> references = ['This is an example sentence', 'Each sentence is considered']
    >>> semscore = evaluate.load("semscore")
    >>> results = semscore.compute(predictions=predictions, references=references)
    >>> print(results['semscore'])
    100.0
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class SemScore(evaluate.Metric):
    """SemScore: semantic textual similarity between predictions and references."""

    def _info(self):
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference.
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            # Additional link to the reference paper.
            reference_urls=["https://arxiv.org/abs/2401.17072"],
        )

    def _download_and_prepare(self, dl_manager):
        """Download and set up the embedding model and tokenizer."""
        if self.config_name == "default":
            checkpoint = "sentence-transformers/all-mpnet-base-v2"
        else:
            checkpoint = self.config_name
        # Load model and tokenizer from the HuggingFace Hub.
        self.model = AutoModel.from_pretrained(checkpoint)
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    @staticmethod
    def _mean_pooling(model_output, attention_mask):
        """Mean pooling over all tokens, taking the attention mask into account for correct averaging."""
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
            input_mask_expanded.sum(1), min=1e-9
        )

    def _compute(self, predictions, references, batch_size=32, device=None):
        """Returns the aggregated SemScore and the per-pair similarities."""
        assert len(predictions) == len(references), "predictions and references should have the same length."
        if device is None:
            device = "cpu"
        if "cuda" in device:
            assert torch.cuda.is_available(), "CUDA device requested but CUDA is not available."
        # Move the model explicitly so repeated calls with different devices stay consistent.
        self.model.to(device)

        pooled_refs, pooled_preds = [], []
        with torch.inference_mode():
            for i in tqdm(range(0, len(references), batch_size), desc="Processing batches"):
                batch_refs = references[i : i + batch_size]
                batch_preds = predictions[i : i + batch_size]
                encoded_refs = self.tokenizer(batch_refs, padding=True, truncation=True, return_tensors="pt")
                encoded_preds = self.tokenizer(batch_preds, padding=True, truncation=True, return_tensors="pt")
                model_output_refs = self.model(**encoded_refs.to(device))
                model_output_preds = self.model(**encoded_preds.to(device))
                batch_pooled_refs = self._mean_pooling(model_output_refs, encoded_refs["attention_mask"])
                batch_pooled_preds = self._mean_pooling(model_output_preds, encoded_preds["attention_mask"])
                pooled_refs.append(batch_pooled_refs)
                pooled_preds.append(batch_pooled_preds)
        pooled_refs, pooled_preds = torch.cat(pooled_refs), torch.cat(pooled_preds)
        # Cosine similarity per prediction-reference pair, scaled to [-100, 100].
        similarities = torch.nn.functional.cosine_similarity(pooled_refs, pooled_preds) * 100
        semscore = torch.mean(similarities)
        return {
            "semscore": round(semscore.item(), 2),
            "similarities": similarities.tolist(),
        }
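

# A minimal smoke test for running this script directly. This is a sketch, not part of
# the metric itself: it assumes the file is saved locally as `semscore.py` and that
# `evaluate.load` is given a path to that local script; the embedding model is
# downloaded from the HuggingFace Hub on first use.
if __name__ == "__main__":
    metric = evaluate.load("semscore.py")  # assumed local script path
    results = metric.compute(
        predictions=["This is an example sentence", "Each sentence is considered"],
        references=["This is an example sentence", "Each sentence is considered"],
    )
    print(results["semscore"])  # identical texts should score 100.0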