import io
import re
import time
import zipfile

import nltk
import pandas as pd
import streamlit as st
from nltk.tokenize import word_tokenize
from streamlit_extras.stylable_container import stylable_container
from transformers import pipeline

# word_tokenize needs the 'punkt_tab' tokenizer data at runtime
nltk.download('punkt_tab')
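
# Sidebar: a styled demo button plus a glossary explaining the NER tags and the
# columns of the results table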
with st.sidebar:
    with stylable_container(
        key="test_button",
        css_styles="""
            button {
                background-color: #0000ff;
                border: none;
                color: white;
            }
            """,
    ):
        st.button("DEMO APP")
st.subheader("Glossary of tags", divider = "red") | |
with st.expander("PER"): | |
st.write(''' | |
Person's name | |
''') | |
with st.expander("ORG"): | |
st.write(''' | |
Organization name | |
''') | |
with st.expander("LOC"): | |
st.write(''' | |
Location name | |
''') | |
with st.expander("MISC"): | |
st.write(''' | |
Miscellaneous | |
''') | |
with st.expander("entity_group"): | |
st.write(''' | |
This is the tag that has been assigned to an entity. | |
''') | |
with st.expander("score"): | |
st.write(''' | |
This indicates the confidence level that a tag has been assigned to an entity. | |
''') | |
with st.expander("word"): | |
st.write(''' | |
This is the entity that has been extracted from your text data. | |
''') | |
with st.expander("start"): | |
st.write(''' | |
This is the index of the first character of the entity in your text data. | |
''') | |
with st.expander("end"): | |
st.write(''' | |
This is the index of the character immediately after the last character of the entity. | |
''') | |
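
# The glossary fields map onto the token-classification pipeline output. For
# example (illustrative values), the input "Angela Merkel visited Paris" yields
# entries like:
#   {'entity_group': 'PER', 'score': <confidence>, 'word': 'Angela Merkel', 'start': 0, 'end': 13}
#   {'entity_group': 'LOC', 'score': <confidence>, 'word': 'Paris', 'start': 22, 'end': 27}
# "Angela Merkel" covers characters 0-12, so start=0 and end=13 (one past the last character).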
st.subheader(":blue[AI Entity Extractor]") | |
st.write("made by [nlpblogs](https://nlpblogs.com/)") | |
st.write("Apache 2.0") | |
st.divider() | |
def clear_text(): | |
st.session_state["text"] = "" | |
text = st.text_area("Paste your text here and then press **Ctrl + Enter**. The length of your text should not exceed 1000 words.", key="text") | |
st.button("Clear text", on_click=clear_text) | |
st.write(text) | |

# Count the words (punctuation stripped first) to enforce the 1000-word limit;
# st.text_area always returns a string, so the count can run unconditionally
text1 = re.sub(r'[^\w\s]', '', text)
tokens = word_tokenize(text1)
st.write("Length", len(tokens))
st.divider()

MAX_WORDS = 1000
if len(tokens) > MAX_WORDS:
    st.warning('The length of your text should not exceed 1000 words.')
    st.stop()
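
# For example, re.sub(r'[^\w\s]', '', "Hello, world!") gives "Hello world", and
# word_tokenize("Hello world") gives ['Hello', 'world'], a length of 2.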

with st.spinner('Wait for it...'):
    time.sleep(5)

if text:
    # aggregation_strategy="simple" merges word pieces into whole-entity spans
    token_classifier = pipeline(model="dslim/bert-base-NER", aggregation_strategy="simple")
    entities = token_classifier(text)
    df = pd.DataFrame(entities)
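
# Each Streamlit rerun executes this script top to bottom, so the model above is
# reloaded on every interaction. A common optional pattern is to cache it once
# with st.cache_resource (a sketch; the helper name is illustrative):
#
#     @st.cache_resource
#     def load_classifier():
#         return pipeline(model="dslim/bert-base-NER", aggregation_strategy="simple")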

# A one-row glossary table bundled with the results in the zip download
dfa = pd.DataFrame(
    data={
        'PER': ['Person'],
        'ORG': ['Organization'],
        'LOC': ['Location'],
        'MISC': ['Miscellaneous'],
    }
)

buf = io.BytesIO()
# Mode "w" writes a fresh archive into the in-memory buffer
with zipfile.ZipFile(buf, "w") as myzip:
    if text:
        myzip.writestr("Summary of the results.csv", df.to_csv())
        myzip.writestr("Glossary of tags.csv", dfa.to_csv())

tab1, tab2 = st.tabs(["Summarize", "Download"])
with tab1:
    if text:
        st.dataframe(df, width=1000)
with tab2:
    st.download_button(
        label="Download zip file",
        data=buf.getvalue(),
        file_name="zip file.zip",
        mime="application/zip",
    )
with st.expander("Limitations and Bias"): | |
st.write(''' | |
The Named Entity Recognition (NER) model used in this demo app is limited by its training dataset of entity-annotated news articles from a specific span of time. This means that it might not perform excellent for all use cases in different domains. Furthermore, the model may occassionally split words into different parts. | |
''') | |
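
# The "split words" caveat above stems from BERT's WordPiece tokenization: rare
# words are broken into subword pieces, and when adjacent pieces receive
# different tags the simple aggregation can report fragments of one word as
# separate entities.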