import datasets
import evaluate

from harim_scorer import Harimplus_Scorer



logger = evaluate.logging.get_logger(__name__)

CODEBASE_URL='https://huggingface.co/spaces/NCSOFT/harim_plus'
PAPER_URL='https://arxiv.org/abs/2211.12118'

_CITATION = """\
@inproceedings{son-etal-2022-harim,
    title = "{H}a{R}i{M}$^+$: Evaluating Summary Quality with Hallucination Risk",
    author = "Son, Seonil (Simon)  and
      Park, Junsoo  and
      Hwang, Jeong-in  and
      Lee, Junghwa  and
      Noh, Hyungjong  and
      Lee, Yeonsoo",
    booktitle = "Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing",
    month = nov,
    year = "2022",
    address = "Online only",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.aacl-main.66",
    pages = "895--924",
    abstract = "One of the challenges of developing a summarization model arises from the difficulty in measuring the factual inconsistency of the generated text. In this study, we reinterpret the decoder overconfidence-regularizing objective suggested in (Miao et al., 2021) as a hallucination risk measurement to better estimate the quality of generated summaries. We propose a reference-free metric, HaRiM+, which only requires an off-the-shelf summarization model to compute the hallucination risk based on token likelihoods. Deploying it requires no additional training of models or ad-hoc modules, which usually need alignment to human judgments. For summary-quality estimation, HaRiM+ records state-of-the-art correlation to human judgment on three summary-quality annotation sets: FRANK, QAGS, and SummEval. We hope that our work, which merits the use of summarization models, facilitates the progress of both automated evaluation and generation of summary.",
}
"""

_DESCRIPTION = f"""**HaRiM+** is a reference-less evaluation metric (i.e. requires only article-summary pair, no reference summary) for summarization which hurls the power of summarization model.
Summarization model inside the HaRiM+ will read and evaluate how good the quality of a summary given the paired article.
It will work great for ranking the summary-article pairs according to its quality.

HaRiM+ is proved effective for benchmarking summarization systems (system-level performance) as well as ranking the article-summary pairs (segment-level performance) in comprehensive aspect such as factuality, consistency, coherency, fluency, and relevance. For details, refer to our [paper]({PAPER_URL}) published in AACL2022.

NOTE that for HaRiM+...
* predictions = summaries (List[str])
* references = articles (List[str])

"""

_KWARGS_DESCRIPTION = """
HaRiM+ score.
Args:
    For scorer = evaluate.load():
    `pretrained_name` (str or pathlib.Path): summarization model checkpoint or path, loaded by transformers.AutoModelForSeq2SeqLM.from_pretrained(). Defaults to Yale-LILY/brio-cnndm-uncased.
    `tokenizer`: (use when your tokenizer cannot be loaded by from_pretrained)Tokenizer function compatible with transformers.PreTrainedTokenizer. It requires tokenizer.pad_token|eos_token|bos_token and tokenizer.__call__() method for HaRiM+ score computation.

    For scorer.compute():
    `predictions` (list of str): generated summaries
    `references` (list of str): source articles to be summarized
    `use_aggregator` (bool): if True, average of the scores are returned
    `bsz` (int): batch size for harim to iterate through the given pairs
    `return_details` (bool): whether to show more than harim+ score (returns logppl, harim term. refer to the paper for detail)
        `tokenwise_score` (bool): whether to show tokenwise scores for input pairs (if return_details=False, this is ignored)

Returns:
    'results' (list of float): harim+ score for each summary-article pair

Examples:
    >>> summaries = ["hello there", "hello there"]
    >>> articles = ["hello, this is the article to be summarized", "hello, this is the article to be summarized"]
    >>> scorer = evaluate.load("NCSOFT/harim_plus") #, pretrained_name='PRETRAINEDNAME', tokenizer=TOKENIZER # optional
    >>> results = scorer.compute(predictions=summaries, references=articles) # use_aggregator=True # optional
    >>> print([round(v, 2) for v in results["harim+"]])
    [float, float]
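    >>> # Optional variant (a sketch, assuming only the kwargs documented above):
    >>> # a custom checkpoint passed at load time, plus score aggregation at compute time.
    >>> scorer = evaluate.load("NCSOFT/harim_plus", pretrained_name="facebook/bart-large-cnn")  # doctest: +SKIP
    >>> scorer.compute(predictions=summaries, references=articles, use_aggregator=True)  # doctest: +SKIP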
"""



@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Harimplus(evaluate.Metric):
    def __init__(self,
                 pretrained_name='facebook/bart-large-cnn',
                 tokenizer=None,
                 device='cuda',
                 **kwargs
                 ):
        super().__init__(**kwargs)
        self.myconfig = dict(
            pretrained_name=pretrained_name,
            tokenizer=tokenizer,
            device=device,
        )

    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            homepage=CODEBASE_URL,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                }
            ),
            codebase_urls=[CODEBASE_URL],
            reference_urls=[CODEBASE_URL, PAPER_URL],
        )

    def _download_and_prepare(self, dl_manager):
        pretrained_name = self.myconfig['pretrained_name']
        is_custom_tokenizer = self.myconfig['tokenizer'] is not None
        logger.warning(
            "Loading HaRiM+ score\n"
            f"\tpretrained_name = {pretrained_name}"
        )
        if is_custom_tokenizer:
            logger.warning(
                "tokenizer is overridden by self.myconfig['tokenizer']"
            )
        logger.warning(
            "You can change checkpoints with the `pretrained_name` kwarg in evaluate.load. We strongly recommend using *-large or larger checkpoints. "
            "Refrain from using checkpoints trained on a noisy corpus such as bbc-XSUM.")

        # download the model checkpoint specified by self.myconfig['pretrained_name'] and set up the scorer
        self.scorer = Harimplus_Scorer(**self.myconfig)

    def _compute(self,
                 predictions=None,
                 references=None,
                 use_aggregator=False,
                 bsz=32,
                 tokenwise_score=False,
                 return_details=False):
        # HaRiM+ convention: predictions are the generated summaries,
        # references are the source articles they summarize
        summaries = predictions
        articles = references
        scores = self.scorer.compute(
            predictions=summaries,
            references=articles,
            use_aggregator=use_aggregator,
            bsz=bsz,
            tokenwise_score=tokenwise_score,
            return_details=return_details,
        )
        return scores
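

# ----------------------------------------------------------------------
# Minimal usage sketch (not part of the metric class itself): loads the
# metric from the Hub and scores two toy article-summary pairs, mirroring
# the doctest in _KWARGS_DESCRIPTION. It assumes `evaluate` is installed
# and a CUDA device is available (the default; a custom `device` would
# have to be supported by Harimplus_Scorer). Illustrative only, not an
# official entry point of the module.
# ----------------------------------------------------------------------
if __name__ == "__main__":
    summaries = ["hello there", "hello there"]
    articles = [
        "hello, this is the article to be summarized",
        "hello, this is the article to be summarized",
    ]
    # optionally: evaluate.load(..., pretrained_name=..., tokenizer=...)
    scorer = evaluate.load("NCSOFT/harim_plus")
    results = scorer.compute(predictions=summaries, references=articles)
    print([round(v, 2) for v in results["harim+"]])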