Spaces:

ner4archives
/

NER4Archives-analytics

Sleeping

File size: 7,269 Bytes

# -*- coding:utf-8 -*-

import io

import pandas
import streamlit as st
from pycaprio import Pycaprio, mappings
from zipfile import ZipFile
from requests.exceptions import JSONDecodeError

from n4a_analytics_lib.analytics import (GlobalStatistics,
                                         IaaStatistics)
from n4a_analytics_lib.constants import KAPPA_LEGEND


@st.cache
def convert_df(df_ex: pandas.DataFrame) -> bytes:
    """Serialize a dataframe to CSV and return it as UTF-8 encoded bytes.

    The result is passed to ``st.download_button`` by the callers in this
    file; the function is wrapped in ``st.cache``.
    """
    csv_text = df_ex.to_csv(encoding="utf-8")
    return csv_text.encode('utf-8')


def check_login(username: str, password: str) -> bool:
    """Return True only when both the username and the password are non-empty.

    This is a presence check on the two fields; it does not contact any
    server or validate the credentials themselves.
    """
    return len(username) != 0 and len(password) != 0


def display_data(col: st.columns) -> None:
    """Render the global statistics stored in the session into ``col``.

    Reads the object stored under ``st.session_state['gs_obj']`` and shows,
    in order: the total number of curated annotations, the ``df_i``
    dataframe, a selector over ``documents`` and the bar plot produced by
    ``create_plot`` for the selected entry.
    """
    stats = st.session_state['gs_obj']

    total_label = f"{stats.total_annotations_project} Named entities"
    col.metric("Total curated annotations", total_label)

    col.dataframe(stats.df_i)

    choice = col.selectbox(
        'Select specific data to display bar plot:',
        stats.documents,
        key="selector_data",
    )
    figure = stats.create_plot(choice)
    col.pyplot(figure)


def template_agreement_dataframe(title: str,
                                 df: pandas.DataFrame,
                                 total_pov: int,
                                 total_annotations: int,
                                 percentage_pov: float,
                                 mode: str) -> None:
    """Render one agreement section: heading, counts, CSV download and table.

    Args:
        title: heading displayed above the section.
        df: dataframe shown in the page and offered as a CSV download.
        total_pov: number of annotations belonging to this section.
        total_annotations: overall number of annotations.
        percentage_pov: share of ``total_pov`` in percent, displayed as given.
        mode: suffix used in the downloaded file name and in the widget key,
            so that each section gets a distinct download button.
    """
    st.subheader(title)

    summary = f"{total_pov} / {total_annotations} annotations ({percentage_pov} %)"
    st.markdown(summary)

    st.download_button(
        label="Press to Download CSV",
        data=convert_df(df),
        file_name=f"csv_annotators_{mode}.csv",
        mime="text/csv",
        key=f'download-csv_{mode}',
    )

    st.dataframe(df)


def init_session_iaa(data: st.file_uploader,
                     baseline: st.file_uploader,
                     col: st.columns) -> None:
    """Render the inter-annotator agreement (IAA) report for an uploaded project.

    Builds an ``IaaStatistics`` object from the uploaded archive and the
    baseline text, then writes to the page: a summary of the baseline text,
    the Fleiss and pairwise Cohen kappa scores with their legend, an optional
    confusion matrix, the agree / disagree tables with CSV downloads and the
    agreement pie charts.

    Args:
        data: uploaded project archive, handed to ``IaaStatistics`` as
            ``zip_project``.
        baseline: uploaded baseline text; its ``getvalue()`` content and its
            ``name`` are used.
        col: Streamlit column that receives the baseline text summary.
    """
    project_analyzed = IaaStatistics(zip_project=data, baseline_text=baseline.getvalue())
    # Indexed below as [0] sentences, [1] words, [2] characters.
    baseline_analyzer = project_analyzed.analyze_text()

    col.markdown(f"""
           ### BASELINE TEXT: {baseline.name}

            - sentences:  {baseline_analyzer[0]}
            - words: {baseline_analyzer[1]}
            - characters: {baseline_analyzer[2]}
           """)

    st.markdown("## 📈 IAA metrics")
    col1_kappa, col2_kappa = st.columns(2)

    # Display Kappa group
    col1_kappa.subheader("Fleiss Kappa (global score for group):")
    col1_kappa.markdown(interpret_kappa(project_analyzed.fleiss_kappa), unsafe_allow_html=True)

    # Display pairs kappa
    col1_kappa.subheader("Cohen Kappa (score for annotators pair):")
    for coders, c_k in project_analyzed.compute_pairs_cohen_kappa().items():
        col1_kappa.markdown(f"* {coders[0]} <> {coders[1]} : {interpret_kappa(c_k)}", unsafe_allow_html=True)

    # Display Kappa legend
    col2_kappa.markdown(KAPPA_LEGEND, unsafe_allow_html=True)

    # Plot confusion matrix
    if st.checkbox('Display confusion matrix'):
        # The default value must lie inside [min_value, max_value]; the
        # previous upper bound of 10 was below the default width of 14, which
        # Streamlit rejects with an exception as soon as the box is ticked.
        width = st.slider("matrix width", min_value=1, max_value=20, value=14)
        height = st.slider("matrix height", min_value=1, max_value=10, value=4)
        st.pyplot(project_analyzed.plot_confusion_matrix(width=width, height=height).figure)

    # Agree CSV
    template_agreement_dataframe(title="✅️ Agree annotations",
                                 df=project_analyzed.df_agree,
                                 total_pov=project_analyzed.total_agree,
                                 total_annotations=project_analyzed.total_annotations,
                                 percentage_pov=project_analyzed.percent_agree,
                                 mode="agree")
    # Disagree CSV
    template_agreement_dataframe(title="❌ Disagree annotations",
                                 df=project_analyzed.df_disagree,
                                 total_pov=project_analyzed.total_disagree,
                                 total_annotations=project_analyzed.total_annotations,
                                 percentage_pov=project_analyzed.percent_disagree,
                                 mode="disagree")
    # Pie plot
    st.subheader("🏷️ Global Labels Statistics")
    st.pyplot(project_analyzed.plot_agreement_pies().figure)


def init_session_statistics(remote: bool, local: bool, data: tuple) -> None:
    """Reset the session and load a ``GlobalStatistics`` object into it.

    Exactly one of ``remote`` / ``local`` is expected to be true; when both
    or neither are set, only the two flags are stored and nothing is loaded.

    Args:
        remote: fetch the curated documents from the INCEpTION host.
        local: build the statistics from an uploaded archive.
        data: in remote mode, a ``(username, password)`` pair (indexed as
            ``data[0]`` and ``data[1]``); in local mode, the object handed to
            ``GlobalStatistics`` as ``zip_project``.
    """
    # Empty the session in place. Assigning a new dict to ``st.session_state``
    # would rebind the attribute on the streamlit module to a plain dict,
    # which is no longer the per-session state and is shared process-wide.
    st.session_state.clear()

    # create a session variable
    st.session_state["gs_local"] = local
    st.session_state["gs_remote"] = remote

    # create a new object:
    # if remote fetch data from API Host first
    if remote and not local:
        st.success('Fetch curated documents from host INCEpTION API in progress...')
        try:
            fetch_curated_data_from_remote(
                username=data[0],
                password=data[1]
            )
        except JSONDecodeError:
            # username / password incorrect
            st.error('Username or Password is incorrect please retry.')
            st.session_state.clear()
    elif local and not remote:
        st.session_state["gs_obj"] = GlobalStatistics(zip_project=data, remote=False)


def fetch_curated_data_from_remote(username: str,
                                   password: str,
                                   endpoint: str = "https://inception.dhlab.epfl.ch/prod",
                                   project_title: str = "ner4archives-template") -> None:
    """Download the curated documents of an INCEpTION project and store their statistics.

    Every document whose state is ``CURATION_COMPLETE`` is exported as a ZIP
    archive; the archives are merged into the first one (members whose name
    is already present are skipped) and the merged archive is handed to
    ``GlobalStatistics``, which is stored under ``st.session_state["gs_obj"]``.

    Args:
        username: INCEpTION account name.
        password: INCEpTION account password.
        endpoint: base URL of the INCEpTION host.
        project_title: name of the project to read.

    Raises:
        IndexError: if no project named ``project_title`` is returned by the
            host, or if the project has no curated document.
    """
    # open a client
    client = Pycaprio(inception_host=endpoint, authentication=(str(username), str(password)))

    # get project object
    matching_projects = [p for p in client.api.projects() if p.project_name == project_title]
    if not matching_projects:
        raise IndexError(f"No INCEpTION project named {project_title!r} was found.")
    project_id = matching_projects[0].project_id

    # export every curated document of the project (one ZIP payload each)
    curated_exports = [
        client.api.curation(project_id, document,
                            curation_format=mappings.InceptionFormat.UIMA_CAS_XMI_XML_1_1)
        for document in client.api.documents(project_id)
        if document.document_state == mappings.DocumentState.CURATION_COMPLETE
    ]
    if not curated_exports:
        raise IndexError(f"Project {project_title!r} has no curated document.")

    # Merge all zip in one. The first curated export is always the writable
    # target, whatever its position among the project documents; the others
    # are only read and are closed as soon as they have been copied.
    with ZipFile(io.BytesIO(curated_exports[0]), mode="a") as merged:
        known_names = set(merged.namelist())
        for export in curated_exports[1:]:
            with ZipFile(io.BytesIO(export), mode="r") as archive:
                for name in archive.namelist():
                    if name not in known_names:
                        merged.writestr(name, archive.read(name))
                        known_names.add(name)

        # Create a new object while the merged archive is still open
        st.session_state["gs_obj"] = GlobalStatistics(zip_project=merged, remote=True)


def interpret_kappa(score: float) -> str:
    """Format a kappa score as a coloured HTML percentage.

    The colour depends on the band the score falls in. Bands are contiguous,
    each one including its upper bound:

    - below 0: ``#e74c3c``
    - 0 to 0.20: ``#f39c12``
    - above 0.20 to 0.40: ``#f4d03f``
    - above 0.40 to 0.60: ``#5dade2``
    - above 0.60 to 0.80: ``#58d68d``
    - above 0.80 to 1: ``#28b463``

    A score that is NaN or greater than 1 gets no colour.

    Args:
        score: kappa value, expected in [-1, 1].

    Returns:
        A ``<span>`` element showing ``score`` as a percentage rounded to two
        decimals.
    """
    # (inclusive upper bound, CSS colour) in increasing order.
    bands = (
        (0.20, "#f39c12;"),
        (0.40, "#f4d03f;"),
        (0.60, "#5dade2;"),
        (0.80, "#58d68d;"),
        (1.00, "#28b463;"),
    )

    color = ""
    if score < 0:
        color = "#e74c3c;"
    else:
        # Comparisons with NaN are all false, so NaN keeps the empty colour.
        for upper_bound, band_color in bands:
            if score <= upper_bound:
                color = band_color
                break

    return f"<span style='font-size:30px; color: {color}'>{round(score*100, 2)} %</span>"