File size: 1,975 Bytes
4a1df2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""

Determine if maintaining the same predicted label
---------------------------------------------------------------------
"""


from .classification_goal_function import ClassificationGoalFunction


class InputReduction(ClassificationGoalFunction):
    """Attempts to reduce the input down to as few words as possible while
    maintaining the same predicted label.

    From Feng, Wallace, Grissom, Iyyer, Rodriguez, Boyd-Graber. (2018).
    Pathologies of Neural Models Make Interpretations Difficult.
    https://arxiv.org/abs/1804.07781
    """

    def __init__(self, *args, target_num_words=1, **kwargs):
        self.target_num_words = target_num_words
        super().__init__(*args, **kwargs)

    def _is_goal_complete(self, model_output, attacked_text):
        return (
            self.ground_truth_output == model_output.argmax()
            and attacked_text.num_words <= self.target_num_words
        )

    def _should_skip(self, model_output, attacked_text):
        return self.ground_truth_output != model_output.argmax()

    def _get_score(self, model_output, attacked_text):
        # Give the lowest score possible to inputs which don't maintain the ground truth label.
        if self.ground_truth_output != model_output.argmax():
            return 0

        cur_num_words = attacked_text.num_words
        initial_num_words = self.initial_attacked_text.num_words

        # The main goal is to reduce the number of words (num_words_score)
        # Higher model score for the ground truth label is used as a tiebreaker (model_score)
        num_words_score = max(
            (initial_num_words - cur_num_words) / initial_num_words, 0
        )
        model_score = model_output[self.ground_truth_output]
        return min(num_words_score + model_score / initial_num_words, 1)

    def extra_repr_keys(self):
        if self.maximizable:
            return ["maximizable"]
        else:
            return ["maximizable", "target_num_words"]