# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""SignWriting Similarity metric from the signwriting-evaluation package."""

import evaluate
import datasets

from signwriting_evaluation.metrics.similarity import SignWritingSimilarityMetric

_CITATION = """\
@misc{moryossef2024signwritingevaluationeffectivesignlanguage,
      title={signwriting-evaluation: Effective Sign Language Evaluation via SignWriting},
      author={Amit Moryossef and Rotem Zilberman and Ohad Langer},
      year={2024},
      eprint={2410.13668},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2410.13668},
}
"""

_DESCRIPTION = """\
SignWriting Similarity metric from the signwriting-evaluation package.
"""

_KWARGS_DESCRIPTION = """
Produces similarity scores for hypotheses given reference translations.

Args:
    predictions (list of str): The predicted sentences, in Formal SignWriting (FSW) notation.
    references (list of list of str): The references. There should be one reference sub-list
        for each prediction sentence.

Returns:
    score (float): The similarity score, between 0 and 1.

Examples:

    Example 1 -- basic similarity score:
        >>> predictions = ["M530x538S37602508x462S15a11493x494S20e00488x510S22f03469x517"]
        >>> references = [["M519x534S37900497x466S3770b497x485S15a51491x501S22f03481x513"]]
        >>> metric = evaluate.load("signwriting_similarity")
        >>> results = metric.compute(predictions=predictions, references=references)
        >>> print(results)
        {'score': 0.5509574768254414}

    Example 2 -- identical signs in a different order:
        >>> predictions = ["M530x538S37602508x462S15a11493x494S20e00488x510S22f03469x517"]
        >>> references = [["M530x538S22f03469x517S37602508x462S20e00488x510S15a11493x494"]]
        >>> metric = evaluate.load("signwriting_similarity")
        >>> results = metric.compute(predictions=predictions, references=references)
        >>> print(results)
        {'score': 1.0}

    Example 3 -- slightly different symbols:
        >>> predictions = ["M530x538S17600508x462S15a11493x494S20e00488x510S22f03469x517"]
        >>> references = [["M530x538S17600508x462S12a11493x494S20e00488x510S22f13469x517"]]
        >>> metric = evaluate.load("signwriting_similarity")
        >>> results = metric.compute(predictions=predictions, references=references)
        >>> print(results)
        {'score': 0.8326259781509948}

    Example 4 -- multiple references for one prediction, one good and one bad:
        >>> predictions = ["M530x538S17600508x462S15a11493x494S20e00488x510S22f03469x517"]
        >>> references = [["M530x538S17600508x462S12a11493x494S20e00488x510S22f13469x517", "M530x538S17600508x462"]]
        >>> metric = evaluate.load("signwriting_similarity")
        >>> results = metric.compute(predictions=predictions, references=references)
        >>> print(results)
        {'score': 0.8326259781509948}

    Example 5 -- multiple signs in the hypothesis:
        >>> predictions = ["M530x538S17600508x462S15a11493x494S20e00488x510S22f03469x517 M530x538S17600508x462S15a11493x494S20e00488x510S22f03469x517"]
        >>> references = [["M530x538S17600508x462S12a11493x494S20e00488x510S22f13469x517"]]
        >>> metric = evaluate.load("signwriting_similarity")
        >>> results = metric.compute(predictions=predictions, references=references)
        >>> print(results)
        {'score': 0.4163129890754974}

    Example 6 -- sign order does not affect similarity:
        >>> predictions = ["M530x538S17600508x462S15a11493x494S20e00488x510S22f03469x517 M530x538S17600508x462S12a11493x494S20e00488x510S22f13469x517"]
        >>> references = [["M530x538S17600508x462S12a11493x494S20e00488x510S22f13469x517 M530x538S17600508x462S15a11493x494S20e00488x510S22f03469x517"]]
        >>> metric = evaluate.load("signwriting_similarity")
        >>> results = metric.compute(predictions=predictions, references=references)
        >>> print(results)
        {'score': 1.0}

    Example 7 -- invalid FSW input results in a score of 0:
        >>> predictions = ["MMp483"]
        >>> references = [["MMp483"]]
        >>> metric = evaluate.load("signwriting_similarity")
        >>> results = metric.compute(predictions=predictions, references=references)
        >>> print(results)
        {'score': 0.0}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class SignWritingSimilarity(evaluate.Metric):
    metric = SignWritingSimilarityMetric()

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            homepage="https://github.com/sign-language-processing/signwriting-evaluation",
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/sign-language-processing/signwriting-evaluation"],
            reference_urls=[
                "https://github.com/sign-language-processing/signwriting-evaluation",
            ],
        )

    def _compute(self, predictions, references):
        # If a single reference string was passed per prediction (the second feature schema),
        # wrap each one in a list so the transposition below stays correct.
        if references and isinstance(references[0], str):
            references = [[ref] for ref in references]
        # Transpose from one reference sub-list per prediction (the `evaluate` convention)
        # to one list per reference stream, each as long as `predictions`,
        # which is the layout `corpus_score` expects.
        references = list(zip(*references))
        score = self.metric.corpus_score(predictions, references)
        return {"score": score}
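if __name__ == "__main__":
    # Minimal local smoke test -- an illustrative sketch, not part of the metric module's API.
    # It assumes this script is saved as `signwriting_similarity.py` in the working directory
    # and that the `evaluate`, `datasets`, and `signwriting-evaluation` packages are installed.
    metric = evaluate.load("signwriting_similarity.py")
    predictions = ["M530x538S37602508x462S15a11493x494S20e00488x510S22f03469x517"]
    references = [["M519x534S37900497x466S3770b497x485S15a51491x501S22f03481x513"]]
    # Expected output per Example 1 above: {'score': 0.5509574768254414}
    print(metric.compute(predictions=predictions, references=references))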