nlpblogs's picture
Update app.py
77bbf28 verified
raw
history blame
4.01 kB
import streamlit as st
import pandas as pd
from streamlit_extras.stylable_container import stylable_container
import time
import zipfile
import io
import nltk
nltk.download('punkt_tab')
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import re
# ---- Sidebar: styled demo button, glossary of NER tags, and model caveats ----
with st.sidebar:
    # Blue "DEMO APP" button, styled via streamlit-extras' stylable_container.
    with stylable_container(
        key="test_button",
        css_styles="""
            button {
                background-color: #0000ff;
                border: none;
                color: white;
            }
            """,
    ):
        st.button("DEMO APP")

    st.subheader("Glossary of tags", divider="red")

    # One checkbox per NER tag; ticking it reveals what the tag stands for.
    per = st.checkbox("PER")
    if per:
        st.write("Person's name")
    org = st.checkbox("ORG")
    if org:
        st.write("Organization")
    loc = st.checkbox("LOC")
    if loc:
        st.write("Location")
    misc = st.checkbox("MISC")
    if misc:
        st.write("Miscellaneous")

    # Expanders documenting each column of the results table.
    with st.expander("entity_group"):
        st.write('''
        This is the tag that has been assigned to an entity.
        ''')
    with st.expander("score"):
        st.write('''
        This indicates the confidence level that a tag has been assigned to an entity.
        ''')
    with st.expander("word"):
        st.write('''
        This is the entity that has been extracted from your text data.
        ''')
    with st.expander("start-end"):
        st.write('''
        This indicates the position of the entity in your text data.
        ''')
    with st.expander("Limitations and Bias"):
        st.write('''
        The Named Entity Recognition (NER) model used in this demo app is limited by its training dataset of entity-annotated news articles from a specific span of time. This means that it might not perform excellently for all use cases in different domains. Furthermore, the model may occasionally split words into different parts.
        ''')
# Main-page header: app title, author credit, and licence note.
st.subheader(":blue[AI Entity Extractor]")
st.write("made by [nlpblogs](https://nlpblogs.com/)")
st.write("Apache 2.0")
st.divider()
def clear_text():
    """Reset the main text area (session-state key "text") to an empty string."""
    st.session_state.update({"text": ""})
# Main input: free text pasted by the user, word-counted with NLTK so the
# 1000-word cap can be enforced before the model runs.
text = st.text_area(
    "Paste your text here and then press **Ctrl + Enter**. The length of your text should not exceed 1000 words.",
    key="text",
)
st.button("Clear text", on_click=clear_text)
st.write(text)

from nltk.tokenize import word_tokenize

# Strip punctuation first so only word tokens count toward the limit.
stripped = re.sub(r'[^\w\s]', '', text)
tokens = word_tokenize(stripped)
st.write("Length", len(tokens))
st.divider()

WORD_LIMIT = 1000
if text is not None and len(tokens) > WORD_LIMIT:
    st.warning('The length of your text should not exceed 1000 words.')
    st.stop()
# Short artificial delay so the spinner is visible before inference starts.
# (time, io and zipfile are already imported at the top of the file; the
# duplicate mid-file imports were removed.)
with st.spinner('Wait for it...'):
    time.sleep(5)

if text is not None:
    # BERT-based NER pipeline; "simple" aggregation merges word pieces back
    # into whole-entity spans. `entities` (not `tokens`) avoids shadowing the
    # NLTK word-token list computed above.
    token_classifier = pipeline(model="dslim/bert-base-NER", aggregation_strategy="simple")
    entities = token_classifier(text)
    df = pd.DataFrame(entities)

# Legend sheet bundled alongside the results in the downloadable zip.
dfa = pd.DataFrame(
    data={
        'PER': ['Person'],
        'ORG': ['Organization'],
        'LOC': ['Location'],
        'MISC': ['Miscellaneous'],
    }
)

# Build the zip entirely in memory: one CSV of results, one CSV of tag legend.
buf = io.BytesIO()
with zipfile.ZipFile(buf, "x") as myzip:
    if text is not None:
        myzip.writestr("Summary of the results.csv", df.to_csv())
        myzip.writestr("Glossary of tags.csv", dfa.to_csv())
# Two result views: an interactive table, and the zip archive download.
tab1, tab2 = st.tabs(["Summarize", "Download"])
with tab1:
    if text is not None:
        st.dataframe(df, width=1000)
with tab2:
    st.download_button(
        label="Download zip file",
        data=buf.getvalue(),
        file_name="zip file.zip",
        mime="application/zip",
    )
# Repeat the model-limitations note beneath the results for visibility.
with st.expander("Limitations and Bias"):
    st.write('''
    The Named Entity Recognition (NER) model used in this demo app is limited by its training dataset of entity-annotated news articles from a specific span of time. This means that it might not perform excellently for all use cases in different domains. Furthermore, the model may occasionally split words into different parts.
    ''')