File size: 7,294 Bytes
519b419
 
74e2066
 
 
519b419
 
74e2066
519b419
74e2066
519b419
 
 
 
 
 
74e2066
519b419
 
 
8c99444
 
519b419
 
 
 
 
 
 
74e2066
519b419
 
 
 
 
 
 
 
 
 
 
 
 
74e2066
 
519b419
 
 
 
 
74e2066
 
 
 
519b419
74e2066
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
519b419
 
74e2066
519b419
 
 
 
 
 
 
74e2066
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
519b419
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# -*- coding:utf-8 -*-

from itertools import combinations
from collections import defaultdict, Counter

import pandas as pd
import seaborn as sns
import matplotlib as plt

plt.use('Agg')

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

from n4a_analytics_lib.project import Project
from n4a_analytics_lib.metrics_utils import (fleiss_kappa_function, cohen_kappa_function, percentage_agreement_pov)


class GlobalStatistics(Project):
    """Project-wide annotation statistics.

    Flattens the ``{source_file: {'labels': [...]}}`` annotations mapping
    inherited from :class:`Project` into dataframes of per-label totals,
    both project-wide (``df_i``) and per source file (``df_details``).
    """

    def __init__(self, zip_project, remote=False):
        """Load the project and precompute the summary dataframes.

        :param zip_project: annotation archive handled by :class:`Project`.
        :param remote: whether the archive is fetched remotely.
        """
        super().__init__(zip_project=zip_project, remote=remote, type="global")
        # One (source_file, label) tuple per annotated entity.
        self.data = [(src_file, ne_label) for src_file, ann in self.annotations.items() for ne_label in ann['labels']]
        self.df_base = pd.DataFrame(self.data, columns=["SOURCE_FILE", "LABEL"])
        # Project-wide totals: one row per LABEL with its TOTAL count.
        self.df_i = self.df_base.groupby(["LABEL"])["LABEL"].count().reset_index(name="TOTAL")
        # Per-file totals: one row per (SOURCE_FILE, LABEL) pair.
        self.df_details = self.df_base.groupby(["SOURCE_FILE", "LABEL"])["LABEL"].count().reset_index(name="TOTAL")

        self.total_annotations_project = self.df_i['TOTAL'].sum()

    def create_plot(self, type_data: str) -> "plt.figure.Figure":
        """Bar plot of per-label totals for one source file.

        :param type_data: SOURCE_FILE value used to filter ``df_details``.
        :return: the matplotlib Figure holding the titled, value-labelled
            bar plot (annotation fixed: the old ``sns.barplot`` annotation
            named a function, not the returned type).
        """
        # apply data filter
        data_tab_filtered = self.df_details.loc[self.df_details['SOURCE_FILE'] == type_data]
        # create a new plot
        ax = sns.barplot(x='LABEL', y='TOTAL', data=data_tab_filtered)
        # add title to plot
        ax.figure.suptitle(type_data)
        # add value labels to bars
        for container in ax.containers:
            ax.bar_label(container)
        return ax.figure


class IaaStatistics(Project):
    """Inter-annotator agreement (IAA) statistics for an annotation project.

    Aligns the mentions and labels produced by each annotator, builds a
    mention x coder dataframe (``base_df``), and exposes agreement metrics
    (Fleiss' kappa, pairwise Cohen's kappa, raw percent agreement) plus
    plotting helpers.
    """

    def __init__(self, zip_project, baseline_text, remote=False):
        """Parse the project and precompute all agreement artefacts.

        :param zip_project: annotation archive handled by :class:`Project`.
        :param baseline_text: raw bytes of the reference text (decoded as UTF-8).
        :param remote: whether the archive is fetched remotely.
        """
        super().__init__(zip_project=zip_project, remote=remote, type="iaa")
        self.baseline_text = baseline_text.decode('utf-8')

        # {coder: [mentions]} and {coder: [labels]} per annotator.
        self.mentions_per_coder = self.extract_refs(self.annotations, self.annotators, type="mentions")
        self.labels_per_coder = self.extract_refs(self.annotations, self.annotators, type="labels")
        # {coder: {mention: label}} — assumes each annotator's 'mentions'
        # and 'labels' lists are positionally aligned (TODO confirm upstream).
        self.annotations_per_coders = {coder: dict(zip(ann[1]['mentions'], ann[1]['labels'])) for coder, ann in zip(self.annotators, self.annotations.items())}
        # Every unordered pair of coders, for pairwise Cohen's kappa.
        self.coders_pairs = list(combinations(self.annotations_per_coders, 2))
        # All distinct mentions across coders, first-seen order preserved.
        self.similar_mention = list(dict.fromkeys([mention for _, mentions in self.mentions_per_coder.items() for mention in mentions]))

        # All distinct labels across coders, first-seen order preserved.
        self.labels_schema = list(dict.fromkeys([label for _, labels in self.labels_per_coder.items() for label in labels]))

        # dataframes and matrix analysis
        self.base_df = self.build_base_df()
        # Rows where every coder assigned the same label ('None' placeholders count).
        self.df_agree = self.base_df[self.base_df[self.annotators].apply(lambda row: self.check_all_equal(row), axis=1)]
        self.df_disagree = self.base_df[self.base_df[self.annotators].apply(lambda row: self.check_all_not_equal(row), axis=1)]
        # Mention x label count matrix for fleiss_kappa_function.
        # axis passed by keyword: positional axis was removed in pandas 2.0.
        self.coders_matrix = self.base_df.apply(pd.Series.value_counts, axis=1).fillna(0).astype(int).values

        # totals
        self.total_annotations = len(self.base_df)
        self.total_agree = len(self.df_agree)
        self.total_disagree = len(self.df_disagree)

        # access to metrics
        self.fleiss_kappa = round(fleiss_kappa_function(self.coders_matrix), 2)
        self.cohen_kappa_pairs = self.compute_pairs_cohen_kappa()
        self.percent_agree = percentage_agreement_pov(self.total_agree, self.total_annotations)
        self.percent_disagree = percentage_agreement_pov(self.total_disagree, self.total_annotations)

    @staticmethod
    def extract_refs(annotations: dict, annotators: list, type: str) -> dict:
        """Map each annotator to one of its annotation lists.

        ``annotations`` values are dicts holding (at least) 'mentions' and
        'labels' keys; ``type`` selects which list to extract per coder.
        """
        return {
            coder: data for coder, ann in zip(
                annotators,
                annotations.items()
            ) for ref, data in ann[1].items() if ref == type
        }

    @staticmethod
    def check_all_equal(iterator: list) -> bool:
        """Return True when every value in *iterator* is identical (or it is empty)."""
        return len(set(iterator)) <= 1

    @staticmethod
    def check_all_not_equal(iterator: list) -> bool:
        """Return True when *iterator* holds at least two distinct values."""
        return len(set(iterator)) > 1

    def plot_confusion_matrix(self, width: int, height: int) -> "plt.axes.Axes":
        """Heatmap of pairwise Cohen's kappa scores between coders.

        NOTE(review): despite the name, each cell holds the kappa score of
        a coder pair (hence vmin=0/vmax=1), not label confusion counts.

        :param width: figure width in inches.
        :param height: figure height in inches.
        :return: the matplotlib Axes carrying the heatmap.
        """
        intermediary = defaultdict(Counter)
        for (src, tgt), count in self.cohen_kappa_pairs.items():
            intermediary[src][tgt] = count

        # Sorted union of all coder names appearing on either axis.
        letters = sorted({key for inner in intermediary.values() for key in inner} | set(intermediary.keys()))

        confusion_matrix = [[intermediary[src][tgt] for tgt in letters] for src in letters]

        df_cm = pd.DataFrame(confusion_matrix, letters, letters)
        # Mask empty cells (pairs with no score) instead of plotting zeros.
        mask = df_cm.values == 0
        sns.set(font_scale=0.7)  # for label size
        colors = ["#e74c3c", "#f39c12", "#f4d03f", "#5dade2", "#58d68d", "#28b463"]

        fig, ax = plt.pyplot.subplots(figsize=(width, height))
        sns.heatmap(df_cm, cmap=colors, annot=True, mask=mask, annot_kws={"size": 7}, vmin=0, vmax=1, ax=ax)  # font size
        return ax

    def build_base_df(self) -> pd.DataFrame:
        """Build the mention x coder label dataframe.

        One row per distinct mention, one column per annotator; each cell
        holds the label that coder gave, or the string 'None' when the
        coder did not annotate that mention.
        """
        df = pd.DataFrame(self.annotations_per_coders, index=self.similar_mention)
        for ann in self.annotators:
            # Reset the column to the placeholder, then fill in the coder's
            # actual labels; skipped mentions keep 'None'.
            df[ann] = 'None'
            for mention, value in self.annotations_per_coders[ann].items():
                df.loc[mention, ann] = value
        return df

    def compute_pairs_cohen_kappa(self) -> dict:
        """Return {(coder1, coder2): Cohen's kappa} for every coder pair."""
        return {
            (c1, c2): cohen_kappa_function(self.labels_per_coder[c1],
                                           self.labels_per_coder[c2]) for c1, c2 in self.coders_pairs
        }

    def count_total_annotations_label(self) -> list:
        """Per label: (label, number of mentions where at least one coder used it).

        axis passed by keyword: positional axis was removed in pandas 2.0.
        """
        return [
            (label, self.base_df.astype(object).eq(label).any(axis=1).sum()) for label in self.labels_schema
        ]

    def total_agree_disagree_per_label(self) -> list:
        """Per label: (label, total rows, % fully-agreed rows, % disagreed rows).

        A row counts as "agreed" when all coders chose the same label
        (one unique value across the row, 'None' placeholders included).
        """
        # Rows where every coder gave the same label — computed once here
        # instead of twice per label as before (same result, less work).
        agreed_rows = self.base_df[self.base_df.nunique(axis=1).eq(1)]
        results = []
        for label, total in self.count_total_annotations_label():
            agreed = agreed_rows.eq(label).any(axis=1).sum()
            results.append((
                label,
                total,
                (agreed / total) * 100,
                ((total - agreed) / total) * 100,
            ))
        return results

    def plot_agreement_pies(self) -> "plt.figure.Figure":
        """One pie chart per label showing its agree/disagree proportions.

        :return: the matplotlib Figure holding all the pies and the legend.
        """
        my_labels = 'agree', 'disagree'
        my_colors = ['#47DBCD', '#F5B14C']
        my_explode = (0, 0.1)
        tasks_to_pie = self.total_agree_disagree_per_label()
        fig, axes = plt.pyplot.subplots(1, len(tasks_to_pie), figsize=(20, 3))
        for counter, t in enumerate(tasks_to_pie):
            # t = (label, total, % agree, % disagree)
            tasks = [t[2], t[3]]
            axes[counter].pie(tasks, autopct='%1.1f%%', startangle=15, shadow=True, colors=my_colors,
                              explode=my_explode)
            axes[counter].set_title(t[0])
            axes[counter].axis('equal')
        fig.set_facecolor("white")
        fig.legend(labels=my_labels, loc="center right", borderaxespad=0.1, title="Labels alignement")
        # plt.savefig(f'./out/pie_alignement_labels_{filename_no_extension}.png', dpi=400)
        return fig

    def analyze_text(self) -> list:
        """returns total sentences, words and characters
        in list format
        """
        return [
            len(sent_tokenize(self.baseline_text, language="french")),
            len(word_tokenize(self.baseline_text, language="french")),
            len(self.baseline_text)
        ]