"""Streamlit demo: multilingual named-entity extraction.

The app takes pasted text, enforces a 2000-word limit (counted with NLTK's
word tokenizer after stripping punctuation), runs a HuggingFace BERT NER
pipeline over it, shows the entities in a table, and offers the results plus
a tag glossary as a downloadable zip archive.
"""

import io
import re
import time  # NOTE(review): unused here — kept in case another chunk of this file needs it
import zipfile

import nltk
import pandas as pd
import streamlit as st
from nltk.tokenize import word_tokenize
from streamlit_extras.stylable_container import stylable_container
from transformers import (  # NOTE(review): AutoTokenizer/AutoModel are unused; pipeline handles loading
    AutoModelForTokenClassification,
    AutoTokenizer,
    pipeline,
)

# Maximum number of word tokens accepted from the user.
MAX_WORDS = 2000


@st.cache_resource
def _ensure_punkt() -> None:
    """Download the NLTK tokenizer data once per process instead of every rerun."""
    nltk.download("punkt_tab", quiet=True)


@st.cache_resource
def _load_ner_pipeline():
    """Load the multilingual NER pipeline once and reuse it across reruns."""
    return pipeline(
        model="Davlan/bert-base-multilingual-cased-ner-hrl",
        aggregation_strategy="simple",
    )


def clear_text() -> None:
    """Callback for the "Clear text" button: reset the text-input widget state."""
    st.session_state["text"] = ""


_ensure_punkt()

with st.sidebar:
    with stylable_container(
        key="test_button",
        css_styles="""
            button {
                background-color: #0000ff;
                border: none;
                color: white;
            }
            """,
    ):
        st.button("DEMO APP")

    st.subheader("Glossary of tags", divider="red")
    # One checkbox per NER tag; ticking a box reveals that tag's meaning.
    if st.checkbox("I"):
        st.write("Person's name")
    if st.checkbox("ORG"):
        st.write("Organization")
    if st.checkbox("LOC"):
        st.write("Location")
    if st.checkbox("B-PER"):
        st.write("Beginning of a person’s name right after another person’s name")
    if st.checkbox("B-ORG"):
        st.write("Beginning of an organisation right after another organization")
    if st.checkbox("B-LOC"):
        st.write("Beginning of a location right after another location")
    if st.checkbox("O"):
        st.write("Outside of a named entity")

st.subheader(":blue[AI Entity Extractor]")
st.divider()

text = st.text_input(
    "Paste your text here and then press **enter**. "
    "The length of your text should not exceed 2000 words.",
    key="text",
)
st.button("Clear text", on_click=clear_text)
st.write(text)

# Strip punctuation before counting so the limit applies to words only.
tokens = word_tokenize(re.sub(r"[^\w\s]", "", text))
st.write("Length", len(tokens))
st.divider()

if len(tokens) > MAX_WORDS:
    st.warning("The length of your text should not exceed 2000 words.")
    st.stop()

# st.text_input returns a string ("" when empty), never None, so test
# truthiness — this also avoids running the model on empty input.
if text:
    token_classifier = _load_ner_pipeline()
    df = pd.DataFrame(token_classifier(text))
    # Drop stray word-piece fragments left over from sub-token aggregation.
    df = df.drop(df[df["word"] == "##s"].index)
else:
    df = pd.DataFrame()

# Glossary shipped alongside the results inside the downloadable zip.
dfa = pd.DataFrame(
    data={
        "I": ["Person"],
        "ORG": ["Organization"],
        "LOC": ["Location"],
        "B-PER": ["Beginning of a person’s name right after another person’s name"],
        "B-ORG": ["Beginning of an organisation right after another organization "],
        "B-LOC": ["Beginning of a location right after another location"],
        "O": ["Outside of a named entity "],
    }
)

# Assemble the zip in memory; "w" (not exclusive "x") since the buffer is
# freshly created on every rerun.
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as myzip:
    myzip.writestr("Glossary of tags.csv", dfa.to_csv())
    if text:
        myzip.writestr("Summary of the results.csv", df.to_csv())

tab1, tab2 = st.tabs(["Summarize", "Download"])
with tab1:
    if text:
        st.dataframe(df, width=1000)
with tab2:
    st.download_button(
        label="Download zip file",
        data=buf.getvalue(),
        file_name="zip file.zip",
        mime="application/zip",
    )