# NOTE: the lines below are Hugging Face web-page chrome captured when this
# file was scraped; they are not Python code, so they are preserved here as
# comments to keep the script runnable.
# nlpblogs's picture
# Update app.py
# 8cae1d5 verified
# raw
# history blame
# 3.79 kB
# Dependencies: Streamlit for the UI, pandas for tabular results, NLTK for
# word counting, and Hugging Face transformers for the NER pipeline.
import streamlit as st
import pandas as pd
from streamlit_extras.stylable_container import stylable_container
import time
import zipfile
import io
import nltk
# Fetch the Punkt tokenizer data required by nltk.tokenize.word_tokenize.
nltk.download('punkt_tab')
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import re
with st.sidebar:
    # "DEMO APP" badge: a button recolored blue via injected CSS.
    with stylable_container(
        key="test_button",
        css_styles="""
        button {
            background-color: #0000ff;
            border: none;
            color: white;
        }
        """,
    ):
        st.button("DEMO APP")

    st.subheader("Glossary", divider="red")

    # Glossary entries as (expander title, body text) pairs, rendered in order.
    _glossary = (
        ("PER", '''
Person's name
'''),
        ("ORG", '''
Organization name
'''),
        ("LOC", '''
Location name
'''),
        ("MISC", '''
Miscellaneous
'''),
        ("entity_group", '''
This is the tag that has been assigned to an entity.
'''),
        ("score", '''
This indicates the confidence level that a tag has been assigned to an entity.
'''),
        ("word", '''
This is the entity that has been extracted from your text data.
'''),
        ("start", '''
This is the index of the first character of the entity in your text data.
'''),
        ("end", '''
This is the index of the character immediately after the last character of the entity.
'''),
    )
    for _title, _body in _glossary:
        with st.expander(_title):
            st.write(_body)
# Main-page header: app title, attribution, license, then a horizontal rule.
st.subheader(":blue[AI Entity Extractor]")
for _line in ("made by [nlpblogs](https://nlpblogs.com/)", "Apache 2.0"):
    st.write(_line)
st.divider()
def clear_text():
    """Blank the text area by resetting its session-state entry (key: "text")."""
    st.session_state.text = ""
# Text input plus a word-count guard; processing stops beyond 1000 words.
text = st.text_area("Paste your text here and then press **Ctrl + Enter**. The length of your text should not exceed 1000 words.", key="text")
st.button("Clear text", on_click=clear_text)
st.write(text)

from nltk.tokenize import word_tokenize

# Strip punctuation before counting so only word-like tokens are tallied.
cleaned = re.sub(r'[^\w\s]', '', text)
tokens = word_tokenize(cleaned)
st.write("Length", len(tokens))
st.divider()

word_limit = 1000
if text is not None and len(tokens) > word_limit:
    st.warning('The length of your text should not exceed 1000 words.')
    st.stop()
# Brief artificial delay so the spinner is visible before the NER work starts.
# The redundant mid-script `import time` was removed: `time` is already
# imported at the top of the file.
with st.spinner('Wait for it...'):
    time.sleep(5)
# Run the Hugging Face NER pipeline over the raw text and tabulate entities.
# NOTE(review): `text` comes from st.text_area, which appears to default to
# an empty string rather than None, so this guard is likely always true —
# kept as-is to preserve original behavior.
if text is not None:
    token_classifier = pipeline(model="dslim/bert-base-NER", aggregation_strategy="simple")
    entities = token_classifier(text)
    df = pd.DataFrame(entities)
# Static glossary table bundled into the zip download alongside the results.
# The redundant mid-script `import zipfile` / `import io` were removed:
# both modules are already imported at the top of the file.
dfa = pd.DataFrame(
    data={
        'PER': ['Person'],
        'ORG': ['Organization'],
        'LOC': ['Location'],
        'MISC': ['Miscellaneous'],
    }
)
# Assemble an in-memory zip holding the NER results and the tag glossary.
# Mode "x" creates a fresh archive in the empty BytesIO buffer.
buf = io.BytesIO()
with zipfile.ZipFile(buf, "x") as archive:
    if text is not None:
        archive.writestr("Summary of the results.csv", df.to_csv())
        archive.writestr("Glossary of tags.csv", dfa.to_csv())
# Two-tab results area: an on-screen table and a zip-file download.
summary_tab, download_tab = st.tabs(["Summarize", "Download"])
with summary_tab:
    if text is not None:
        st.dataframe(df, width=1000)
with download_tab:
    st.download_button(
        label="Download zip file",
        data=buf.getvalue(),
        file_name="zip file.zip",
        mime="application/zip",
    )
# Disclaimer about the NER model's training-data limitations.
# Fixed user-facing typos: "occassionally" -> "occasionally" and the
# ungrammatical "perform excellent" -> "perform well".
with st.expander("Limitations and Bias"):
    st.write('''
The Named Entity Recognition (NER) model used in this demo app is limited by its training dataset of entity-annotated news articles from a specific span of time. This means that it might not perform well for all use cases in different domains. Furthermore, the model may occasionally split words into different parts.
''')