File size: 7,269 Bytes
519b419
 
74e2066
 
 
519b419
74e2066
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
519b419
 
74e2066
 
 
 
 
 
 
 
 
 
 
 
519b419
74e2066
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# -*- coding:utf-8 -*-

import io

import pandas
import streamlit as st
from pycaprio import Pycaprio, mappings
from zipfile import ZipFile
from requests.exceptions import JSONDecodeError

from n4a_analytics_lib.analytics import (GlobalStatistics,
                                         IaaStatistics)
from n4a_analytics_lib.constants import KAPPA_LEGEND


@st.cache
def convert_df(df_ex: pandas.DataFrame) -> bytes:
    """Serialize a dataframe to CSV and return it as UTF-8 encoded bytes.

    The result is passed through Streamlit's cache decorator, so the
    conversion is not repeated for a dataframe that was already converted.

    Args:
        df_ex: dataframe to export.

    Returns:
        The CSV text of ``df_ex`` encoded as UTF-8.
    """
    csv_text = df_ex.to_csv(encoding="utf-8")
    return csv_text.encode('utf-8')


def check_login(username: str, password: str) -> bool:
    """Tell whether both credential fields have been filled in.

    This only checks that something was typed; it does not validate the
    credentials against any service.

    Args:
        username: login name entered by the user.
        password: password entered by the user.

    Returns:
        True when both values are non-empty, False when either one is
        empty (or ``None``).
    """
    # Truthiness covers the empty string and also a missing (None) value,
    # which previously raised TypeError on len().
    return bool(username) and bool(password)


def display_data(col: st.columns) -> None:
    """Render the global statistics stored in the session into ``col``.

    Reads the statistics object stored under ``st.session_state['gs_obj']``
    and writes, in order: a metric with the total number of curated
    annotations, the ``df_i`` dataframe, a select box listing the object's
    ``documents``, and the plot created for the selected entry.

    Args:
        col: Streamlit container the widgets are written to.
    """
    stats = st.session_state['gs_obj']

    annotation_total = f"{stats.total_annotations_project} Named entities"
    col.metric("Total curated annotations", annotation_total)

    col.dataframe(stats.df_i)

    chosen_entry = col.selectbox(
        'Select specific data to display bar plot:',
        stats.documents,
        key="selector_data",
    )
    bar_plot = stats.create_plot(chosen_entry)
    col.pyplot(bar_plot)


def template_agreement_dataframe(title: str,
                                 df: pandas.DataFrame,
                                 total_pov: int,
                                 total_annotations: int,
                                 percentage_pov: float,
                                 mode: str) -> None:
    """Render one agreement section: heading, ratio, CSV download and table.

    Args:
        title: text of the section sub-header.
        df: dataframe shown in the page and offered as a CSV download.
        total_pov: number of annotations belonging to this section.
        total_annotations: total number of annotations, shown as denominator.
        percentage_pov: percentage displayed next to the ratio.
        mode: suffix used for the CSV file name and the download widget key.
    """
    st.subheader(title)

    ratio_line = f"{total_pov} / {total_annotations} annotations ({percentage_pov} %)"
    st.markdown(ratio_line)

    # The widget key embeds ``mode`` so several sections can each carry
    # their own download button on the same page.
    csv_payload = convert_df(df)
    csv_filename = f"csv_annotators_{mode}.csv"
    widget_key = f'download-csv_{mode}'
    st.download_button("Press to Download CSV", csv_payload, csv_filename, "text/csv", key=widget_key)

    st.dataframe(df)


def init_session_iaa(data: st.file_uploader,
                     baseline: st.file_uploader,
                     col: st.columns) -> None:
    """Render the inter-annotator agreement (IAA) report for a project.

    Builds an ``IaaStatistics`` object from the uploaded archive and the
    baseline text, then writes to the page: a summary of the baseline text,
    the Fleiss and pairwise Cohen kappa scores next to the kappa legend, an
    optional confusion matrix, the agree / disagree tables with their CSV
    downloads, and the agreement pie plots.

    Args:
        data: uploaded project archive, handed to ``IaaStatistics`` as
            ``zip_project``.
        baseline: uploaded baseline text; its ``getvalue()`` content and
            ``name`` are used.
        col: Streamlit container that receives the baseline summary.
    """
    project_analyzed = IaaStatistics(zip_project=data, baseline_text=baseline.getvalue())
    baseline_analyzer = project_analyzed.analyze_text()

    col.markdown(f"""
           ### BASELINE TEXT: {baseline.name}

            - sentences:  {baseline_analyzer[0]}
            - words: {baseline_analyzer[1]}
            - characters: {baseline_analyzer[2]}
           """)

    # Emoji are written as named escapes so the headings survive any
    # re-encoding of this source file.
    st.markdown("## \N{CHART WITH UPWARDS TREND} IAA metrics")
    col1_kappa, col2_kappa = st.columns(2)

    # Display Kappa group
    col1_kappa.subheader("Fleiss Kappa (global score for group):")
    col1_kappa.markdown(interpret_kappa(project_analyzed.fleiss_kappa), unsafe_allow_html=True)

    # Display pairs kappa
    col1_kappa.subheader("Cohen Kappa (score for annotators pair):")
    for coders, c_k in project_analyzed.compute_pairs_cohen_kappa().items():
        col1_kappa.markdown(f"* {coders[0]} <> {coders[1]} : {interpret_kappa(c_k)}", unsafe_allow_html=True)

    # Display Kappa legend
    col2_kappa.markdown(KAPPA_LEGEND, unsafe_allow_html=True)

    # Plot confusion matrix
    if st.checkbox('Display confusion matrix'):
        # The slider default must lie inside [min_value, max_value]; the
        # previous upper bound of 10 was below the default width of 14.
        width = st.slider("matrix width", min_value=1, max_value=20, value=14)
        height = st.slider("matrix height", min_value=1, max_value=20, value=4)
        st.pyplot(project_analyzed.plot_confusion_matrix(width=width, height=height).figure)

    # Agree CSV
    template_agreement_dataframe(title="\N{WHITE HEAVY CHECK MARK}\N{VARIATION SELECTOR-16} Agree annotations",
                                 df=project_analyzed.df_agree,
                                 total_pov=project_analyzed.total_agree,
                                 total_annotations=project_analyzed.total_annotations,
                                 percentage_pov=project_analyzed.percent_agree,
                                 mode="agree")
    # Disagree CSV
    template_agreement_dataframe(title="\N{CROSS MARK} Disagree annotations",
                                 df=project_analyzed.df_disagree,
                                 total_pov=project_analyzed.total_disagree,
                                 total_annotations=project_analyzed.total_annotations,
                                 percentage_pov=project_analyzed.percent_disagree,
                                 mode="disagree")
    # Pie plot
    st.subheader("\N{LABEL}\N{VARIATION SELECTOR-16} Global Labels Statistics")
    st.pyplot(project_analyzed.plot_agreement_pies().figure)


def _reset_session_state() -> None:
    """Remove every entry from ``st.session_state`` in place."""
    # Keys are deleted one by one instead of assigning a fresh dict:
    # rebinding ``st.session_state`` would replace Streamlit's session-state
    # object with an ordinary module-level dictionary.
    for key in list(st.session_state.keys()):
        del st.session_state[key]


def init_session_statistics(remote: bool, local: bool, data: tuple) -> None:
    """Reset the session and load the global statistics object into it.

    After the reset, ``gs_local`` and ``gs_remote`` record the selected
    mode. On success ``gs_obj`` holds the ``GlobalStatistics`` instance.
    When the remote fetch fails with ``JSONDecodeError`` an error message is
    shown and the session is emptied again.

    Args:
        remote: True to fetch the curated documents from the remote host.
        local: True to build the statistics from a local archive.
        data: in remote mode, a sequence whose first two items are the
            username and the password; in local mode, the object passed to
            ``GlobalStatistics`` as ``zip_project``.
    """
    # clear session
    _reset_session_state()

    # create a session variable
    st.session_state["gs_local"] = local
    st.session_state["gs_remote"] = remote

    # create a new object:
    # if remote fetch data from API Host first
    if remote and not local:
        st.success('Fetch curated documents from host INCEpTION API in progress...')
        try:
            fetch_curated_data_from_remote(
                username=data[0],
                password=data[1]
            )
        except JSONDecodeError:
            # username / password incorrect
            st.error('Username or Password is incorrect please retry.')
            _reset_session_state()
    elif local and not remote:
        st.session_state["gs_obj"] = GlobalStatistics(zip_project=data, remote=False)


def _merge_zip_archives(archives: list) -> ZipFile:
    """Merge in-memory ZIP archives into the first one and return it open.

    Args:
        archives: non-empty list of ZIP file contents (bytes).

    Returns:
        A ``ZipFile`` opened in append mode on the first archive, extended
        with every member of the other archives whose name it did not
        already contain. The caller is responsible for closing it.
    """
    # The merge target is always opened in append mode so it can be written
    # to, whatever the position of its document in the project.
    merged = ZipFile(io.BytesIO(archives[0]), mode="a")
    try:
        known_names = set(merged.namelist())
        for content in archives[1:]:
            with ZipFile(io.BytesIO(content), mode="r") as extra:
                for member in extra.namelist():
                    if member not in known_names:
                        merged.writestr(member, extra.read(member))
                        known_names.add(member)
    except Exception:
        # Do not leak the open archive when the merge fails half-way.
        merged.close()
        raise
    return merged


def fetch_curated_data_from_remote(username: str,
                                   password: str,
                                   endpoint: str = "https://inception.dhlab.epfl.ch/prod",
                                   project_title: str = "ner4archives-template") -> None:
    """Download the curated documents of a project and store their statistics.

    Connects to the INCEpTION host, collects the curation export of every
    document whose state is ``CURATION_COMPLETE``, merges the exported ZIP
    archives into one and stores a ``GlobalStatistics`` built from it under
    ``st.session_state["gs_obj"]``.

    Args:
        username: account name used to authenticate against the host.
        password: password used to authenticate against the host.
        endpoint: base URL of the INCEpTION host.
        project_title: name of the project to read documents from.

    Raises:
        IndexError: if no project is named ``project_title`` or if the
            project has no curated document (same exception type as before,
            now with an explicit message).
    """
    # open a client
    client = Pycaprio(inception_host=endpoint, authentication=(str(username), str(password)))

    # get project object
    matching_projects = [p for p in client.api.projects() if p.project_name == project_title]
    if not matching_projects:
        raise IndexError(f"No project named {project_title!r} found on {endpoint}")
    project_id = matching_projects[0].project_id

    # retrieve the curation export of every curated document, once each
    curated_archives = [
        client.api.curation(project_id, document,
                            curation_format=mappings.InceptionFormat.UIMA_CAS_XMI_XML_1_1)
        for document in client.api.documents(project_id)
        if document.document_state == mappings.DocumentState.CURATION_COMPLETE
    ]
    if not curated_archives:
        raise IndexError(f"Project {project_title!r} has no curated document")

    # Merge all zip in one and create a new object while the archive is open
    with _merge_zip_archives(curated_archives) as merged:
        st.session_state["gs_obj"] = GlobalStatistics(zip_project=merged, remote=True)


def interpret_kappa(score: float) -> str:
    """Format a kappa score as a coloured HTML percentage.

    The colour is picked from contiguous bands so that every score gets
    one, including 0, 1 and values falling between two hundredths
    (e.g. 0.205):

    - below 0: ``#e74c3c``
    - 0 to 0.20: ``#f39c12``
    - above 0.20 to 0.40: ``#f4d03f``
    - above 0.40 to 0.60: ``#5dade2``
    - above 0.60 to 0.80: ``#58d68d``
    - above 0.80: ``#28b463``

    Args:
        score: kappa coefficient.

    Returns:
        A ``<span>`` element showing the score as a percentage rounded to
        two decimals. A NaN score is rendered without a colour.
    """
    # (inclusive upper bound, colour) for the bands between 0 and 0.80.
    bands = (
        (0.20, "#f39c12;"),
        (0.40, "#f4d03f;"),
        (0.60, "#5dade2;"),
        (0.80, "#58d68d;"),
    )

    color = ""
    if score < 0:
        color = "#e74c3c;"
    elif score >= 0:  # False for NaN, which keeps the empty colour
        color = "#28b463;"
        for upper_bound, band_color in bands:
            if score <= upper_bound:
                color = band_color
                break

    return f"<span style='font-size:30px; color: {color}'>{round(score*100, 2)} %</span>"