import io
import re
import time
import zipfile

import nltk
import pandas as pd
import streamlit as st
from nltk.tokenize import word_tokenize
from streamlit_extras.stylable_container import stylable_container
from transformers import pipeline

# Download the NLTK tokenizer data required by word_tokenize.
nltk.download("punkt_tab")

with st.sidebar:
    with stylable_container(
        key="test_button",
        css_styles="""
            button {
                background-color: #0000ff;
                border: none;
                color: white;
            }
        """,
    ):
        st.button("DEMO APP")

    st.subheader("Glossary", divider="red")
    with st.expander("PER"):
        st.write('''Person's name''')
    with st.expander("ORG"):
        st.write('''Organization name''')
    with st.expander("LOC"):
        st.write('''Location name''')
    with st.expander("MISC"):
        st.write('''Miscellaneous''')
    with st.expander("entity_group"):
        st.write('''The tag assigned to an entity.''')
    with st.expander("score"):
        st.write('''The model's confidence that the tag assigned to an entity is correct.''')
    with st.expander("word"):
        st.write('''The entity extracted from your text.''')
    with st.expander("start"):
        st.write('''The index of the first character of the entity in your text.''')
    with st.expander("end"):
        st.write('''The index of the character immediately after the last character of the entity.''')

st.subheader(":blue[AI Entity Extractor]")
st.write("made by [nlpblogs](https://nlpblogs.com/)")
st.write("Apache 2.0")
st.divider()


def clear_text():
    st.session_state["text"] = ""


text = st.text_area(
    "Paste your text here and then press **Ctrl + Enter**. "
    "The length of your text should not exceed 1000 words.",
    key="text",
)
st.button("Clear text", on_click=clear_text)
st.write(text)

# Count words by stripping punctuation and tokenizing with NLTK.
cleaned_text = re.sub(r"[^\w\s]", "", text)
tokens = word_tokenize(cleaned_text)
st.write("Length", len(tokens))
st.divider()

MAX_WORDS = 1000
if len(tokens) > MAX_WORDS:
    st.warning("The length of your text should not exceed 1000 words.")
    st.stop()

with st.spinner("Wait for it..."):
    time.sleep(5)  # artificial delay so the spinner is visible
    if text:
        # Named entity recognition with a pretrained BERT model; "simple"
        # aggregation merges sub-word tokens into whole entities.
        token_classifier = pipeline(
            "token-classification",
            model="dslim/bert-base-NER",
            aggregation_strategy="simple",
        )
        entities = token_classifier(text)
        df = pd.DataFrame(entities)

# Glossary of the NER tags, shipped alongside the results.
dfa = pd.DataFrame(
    data={
        "PER": ["Person"],
        "ORG": ["Organization"],
        "LOC": ["Location"],
        "MISC": ["Miscellaneous"],
    }
)

# Bundle the results and the glossary into an in-memory zip archive.
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as myzip:
    if text:
        myzip.writestr("Summary of the results.csv", df.to_csv())
        myzip.writestr("Glossary of tags.csv", dfa.to_csv())

tab1, tab2 = st.tabs(["Summarize", "Download"])
with tab1:
    if text:
        st.dataframe(df, width=1000)
with tab2:
    st.download_button(
        label="Download zip file",
        data=buf.getvalue(),
        file_name="results.zip",
        mime="application/zip",
    )

with st.expander("Limitations and Bias"):
    st.write('''
    The Named Entity Recognition (NER) model used in this demo app was
    trained on entity-annotated news articles from a specific span of
    time, so it may not perform well on text from other domains.
    Furthermore, the model may occasionally split a single word into
    several entities.
    ''')
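
# For reference, a minimal sketch of the aggregated output that
# token_classifier(text) returns and that pd.DataFrame() tabulates above.
# The example input follows the dslim/bert-base-NER model card; the scores
# shown are illustrative and will differ slightly in practice:
#
#   token_classifier("My name is Wolfgang and I live in Berlin")
#   # [{'entity_group': 'PER', 'score': 0.998, 'word': 'Wolfgang',
#   #   'start': 11, 'end': 19},
#   #  {'entity_group': 'LOC', 'score': 0.999, 'word': 'Berlin',
#   #   'start': 34, 'end': 40}]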