# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BLEU metric."""

import datasets

import evaluate

from .nmt_bleu import compute_bleu  # From: https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py
from .tokenizer_13a import Tokenizer13a


_CITATION = """\
@INPROCEEDINGS{Papineni02bleu:a,
    author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu},
    title = {BLEU: a Method for Automatic Evaluation of Machine Translation},
    booktitle = {},
    year = {2002},
    pages = {311--318}
}
@inproceedings{lin-och-2004-orange,
    title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation",
    author = "Lin, Chin-Yew and
      Och, Franz Josef",
    booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics",
    month = "aug 23{--}aug 27",
    year = "2004",
    address = "Geneva, Switzerland",
    publisher = "COLING",
    url = "https://www.aclweb.org/anthology/C04-1072",
    pages = "501--507",
}
"""


_DESCRIPTION = """\
BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the quality of text which has been machine-translated from one natural language to another.
Quality is considered to be the correspondence between a machine's output and that of a human: "the closer a machine translation is to a professional human translation, the better it is"
– this is the central idea behind BLEU. BLEU was one of the first metrics to claim a high correlation with human judgements of quality, and remains one of the most popular automated and inexpensive metrics.
Scores are calculated for individual translated segments—generally sentences—by comparing them with a set of good quality reference translations.
Those scores are then averaged over the whole corpus to reach an estimate of the translation's overall quality.
Neither intelligibility nor grammatical correctness is taken into account.
"""

_KWARGS_DESCRIPTION = """
Computes BLEU score of translated segments against one or more references.
Args:
    predictions: list of translations to score.
    references: list of lists of references for each translation (or a single list of
        reference strings, one per translation).
    tokenizer: approach used for tokenizing `predictions` and `references`.
        The default tokenizer is `tokenizer_13a`, a minimal tokenization approach that is equivalent to `mteval-v13a`, used by WMT.
        This can be replaced by any function that takes a string as input and returns a list of tokens as output.
    max_order: maximum n-gram order to use when computing the BLEU score. Defaults to 4.
    smooth: whether or not to apply Lin et al. 2004 smoothing. Defaults to False.
Returns:
    'bleu': bleu score,
    'precisions': n-gram precisions (one value per order, up to `max_order`),
    'brevity_penalty': brevity penalty,
    'length_ratio': ratio of translation length to reference length,
    'translation_length': translation length,
    'reference_length': reference length
Examples:

    >>> predictions = ["hello there general kenobi", "foo bar foobar"]
    >>> references = [
    ...     ["hello there general kenobi", "hello there!"],
    ...     ["foo bar foobar"]
    ... ]
    >>> bleu = evaluate.load("bleu")
    >>> results = bleu.compute(predictions=predictions, references=references)
    >>> print(results["bleu"])
    1.0
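
    A custom tokenizer can also be supplied. This is an illustrative sketch using plain
    whitespace splitting; any callable that maps a string to a list of tokens works.
    The predictions above each exactly match one reference, so the score is again 1.0:

    >>> results = bleu.compute(predictions=predictions, references=references, tokenizer=str.split)
    >>> print(results["bleu"])
    1.0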
| """ | |


class Bleu(evaluate.EvaluationModule):
    def _info(self):
        return evaluate.EvaluationModuleInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Two accepted input schemas: multiple references per prediction
            # (list of lists of strings) or a single reference string per prediction.
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py"],
            reference_urls=[
                "https://en.wikipedia.org/wiki/BLEU",
                "https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213",
            ],
        )

    def _compute(self, predictions, references, tokenizer=Tokenizer13a(), max_order=4, smooth=False):
        # if only one reference is provided make sure we still use list of lists
        if isinstance(references[0], str):
            references = [[ref] for ref in references]

        references = [[tokenizer(r) for r in ref] for ref in references]
        predictions = [tokenizer(p) for p in predictions]
        score = compute_bleu(
            reference_corpus=references, translation_corpus=predictions, max_order=max_order, smooth=smooth
        )
        (bleu, precisions, bp, ratio, translation_length, reference_length) = score
        return {
            "bleu": bleu,
            "precisions": precisions,
            "brevity_penalty": bp,
            "length_ratio": ratio,
            "translation_length": translation_length,
            "reference_length": reference_length,
        }
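
# Illustrative usage (a sketch, assuming this module is loaded through `evaluate.load`;
# the `max_order` and `smooth` keyword arguments are forwarded to `_compute` above):
#
#     import evaluate
#
#     bleu = evaluate.load("bleu")
#     results = bleu.compute(
#         predictions=["the cat sat on the mat"],
#         references=[["the cat is on the mat"]],
#         max_order=2,    # only unigram and bigram precisions
#         smooth=True,    # Lin & Och (2004) add-one smoothing
#     )
#     print(results["bleu"], results["precisions"])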