# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""SignWriting Similarity metric from the signwriting-evaluation package."""

import evaluate
import datasets

from signwriting_evaluation.metrics.similarity import SignWritingSimilarityMetric

_CITATION = """\
@misc{moryossef2024signwritingevaluationeffectivesignlanguage,
      title={signwriting-evaluation: Effective Sign Language Evaluation via SignWriting},
      author={Amit Moryossef and Rotem Zilberman and Ohad Langer},
      year={2024},
      eprint={2410.13668},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2410.13668},
}
"""

_DESCRIPTION = """\
SignWriting Similarity metric from the signwriting-evaluation package.
"""

_KWARGS_DESCRIPTION = """
Produces similarity scores for hypotheses given reference translations.

Args:
    predictions (list of str): The predicted sentences, in Formal SignWriting (FSW) notation.
    references (list of list of str): The references. There should be one reference sub-list
        for each prediction sentence.

Returns:
    score (float): The similarity score, between 0 and 1.

Examples:

    Example 1 -- basic similarity score:
        >>> predictions = ["M530x538S37602508x462S15a11493x494S20e00488x510S22f03469x517"]
        >>> references = [["M519x534S37900497x466S3770b497x485S15a51491x501S22f03481x513"]]
        >>> metric = evaluate.load("signwriting_similarity")
        >>> results = metric.compute(predictions=predictions, references=references)
        >>> print(results)
        {'score': 0.5509574768254414}

    Example 2 -- identical signs in a different order:
        >>> predictions = ["M530x538S37602508x462S15a11493x494S20e00488x510S22f03469x517"]
        >>> references = [["M530x538S22f03469x517S37602508x462S20e00488x510S15a11493x494"]]
        >>> metric = evaluate.load("signwriting_similarity")
        >>> results = metric.compute(predictions=predictions, references=references)
        >>> print(results)
        {'score': 1.0}

    Example 3 -- slightly different symbols:
        >>> predictions = ["M530x538S17600508x462S15a11493x494S20e00488x510S22f03469x517"]
        >>> references = [["M530x538S17600508x462S12a11493x494S20e00488x510S22f13469x517"]]
        >>> metric = evaluate.load("signwriting_similarity")
        >>> results = metric.compute(predictions=predictions, references=references)
        >>> print(results)
        {'score': 0.8326259781509948}

    Example 4 -- multiple references for one prediction, one good and one bad:
        >>> predictions = ["M530x538S17600508x462S15a11493x494S20e00488x510S22f03469x517"]
        >>> references = [["M530x538S17600508x462S12a11493x494S20e00488x510S22f13469x517", "M530x538S17600508x462"]]
        >>> metric = evaluate.load("signwriting_similarity")
        >>> results = metric.compute(predictions=predictions, references=references)
        >>> print(results)
        {'score': 0.8326259781509948}

    Example 5 -- multiple signs in the hypothesis:
        >>> predictions = ["M530x538S17600508x462S15a11493x494S20e00488x510S22f03469x517 M530x538S17600508x462S15a11493x494S20e00488x510S22f03469x517"]
        >>> references = [["M530x538S17600508x462S12a11493x494S20e00488x510S22f13469x517"]]
        >>> metric = evaluate.load("signwriting_similarity")
        >>> results = metric.compute(predictions=predictions, references=references)
        >>> print(results)
        {'score': 0.4163129890754974}

    Example 6 -- sign order does not affect similarity:
        >>> predictions = ["M530x538S17600508x462S15a11493x494S20e00488x510S22f03469x517 M530x538S17600508x462S12a11493x494S20e00488x510S22f13469x517"]
        >>> references = [["M530x538S17600508x462S12a11493x494S20e00488x510S22f13469x517 M530x538S17600508x462S15a11493x494S20e00488x510S22f03469x517"]]
        >>> metric = evaluate.load("signwriting_similarity")
        >>> results = metric.compute(predictions=predictions, references=references)
        >>> print(results)
        {'score': 1.0}

    Example 7 -- invalid FSW input results in a score of 0:
        >>> predictions = ["MMp483"]
        >>> references = [["MMp483"]]
        >>> metric = evaluate.load("signwriting_similarity")
        >>> results = metric.compute(predictions=predictions, references=references)
        >>> print(results)
        {'score': 0.0}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class SignWritingSimilarity(evaluate.Metric):
    metric = SignWritingSimilarityMetric()

    def _info(self):
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            homepage="https://github.com/sign-language-processing/signwriting-evaluation",
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/sign-language-processing/signwriting-evaluation"],
            reference_urls=[
                "https://github.com/sign-language-processing/signwriting-evaluation",
            ],
        )

    def _compute(self, predictions, references):
        # If a single reference string was passed per prediction (the second feature schema),
        # wrap each one in a list so the transposition below stays correct.
        if references and isinstance(references[0], str):
            references = [[ref] for ref in references]
        # Transpose from one reference sub-list per prediction (the `evaluate` convention)
        # to one list per reference stream, each as long as `predictions`,
        # which is the layout `corpus_score` expects.
        references = list(zip(*references))
        score = self.metric.corpus_score(predictions, references)
        return {"score": score}
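if __name__ == "__main__":
    # Minimal local smoke test -- an illustrative sketch, not part of the metric module's API.
    # It assumes this script is saved as `signwriting_similarity.py` in the working directory
    # and that the `evaluate`, `datasets`, and `signwriting-evaluation` packages are installed.
    metric = evaluate.load("signwriting_similarity.py")
    predictions = ["M530x538S37602508x462S15a11493x494S20e00488x510S22f03469x517"]
    references = [["M519x534S37900497x466S3770b497x485S15a51491x501S22f03481x513"]]
    # Expected output per Example 1 above: {'score': 0.5509574768254414}
    print(metric.compute(predictions=predictions, references=references))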