import io
import re
import time
import zipfile

import nltk
import pandas as pd
import streamlit as st
from nltk.tokenize import word_tokenize
from streamlit_extras.stylable_container import stylable_container
from transformers import pipeline

# Download the NLTK tokenizer data required by word_tokenize.
nltk.download("punkt_tab")

with st.sidebar:
    with stylable_container(
        key="test_button",
        css_styles="""
            button {
                background-color: #0000ff;
                border: none;
                color: white;
            }
        """,
    ):
        st.button("DEMO APP")

    st.subheader("Glossary", divider="red")
    with st.expander("PER"):
        st.write('''Person's name''')
    with st.expander("ORG"):
        st.write('''Organization name''')
    with st.expander("LOC"):
        st.write('''Location name''')
    with st.expander("MISC"):
        st.write('''Miscellaneous''')
    with st.expander("entity_group"):
        st.write('''The tag assigned to an entity.''')
    with st.expander("score"):
        st.write('''The model's confidence that the tag assigned to an entity is correct.''')
    with st.expander("word"):
        st.write('''The entity extracted from your text.''')
    with st.expander("start"):
        st.write('''The index of the first character of the entity in your text.''')
    with st.expander("end"):
        st.write('''The index of the character immediately after the last character of the entity.''')

st.subheader(":blue[AI Entity Extractor]")
st.write("made by [nlpblogs](https://nlpblogs.com/)")
st.write("Apache 2.0")
st.divider()


def clear_text():
    st.session_state["text"] = ""


text = st.text_area(
    "Paste your text here and then press **Ctrl + Enter**. "
    "The length of your text should not exceed 1000 words.",
    key="text",
)
st.button("Clear text", on_click=clear_text)
st.write(text)

# Count words by stripping punctuation and tokenizing with NLTK.
cleaned_text = re.sub(r"[^\w\s]", "", text)
tokens = word_tokenize(cleaned_text)
st.write("Length", len(tokens))
st.divider()

MAX_WORDS = 1000
if len(tokens) > MAX_WORDS:
    st.warning("The length of your text should not exceed 1000 words.")
    st.stop()

with st.spinner("Wait for it..."):
    time.sleep(5)  # artificial delay so the spinner is visible
    if text:
        # Named entity recognition with a pretrained BERT model; "simple"
        # aggregation merges sub-word tokens into whole entities.
        token_classifier = pipeline(
            "token-classification",
            model="dslim/bert-base-NER",
            aggregation_strategy="simple",
        )
        entities = token_classifier(text)
        df = pd.DataFrame(entities)

# Glossary of the NER tags, shipped alongside the results.
dfa = pd.DataFrame(
    data={
        "PER": ["Person"],
        "ORG": ["Organization"],
        "LOC": ["Location"],
        "MISC": ["Miscellaneous"],
    }
)

# Bundle the results and the glossary into an in-memory zip archive.
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as myzip:
    if text:
        myzip.writestr("Summary of the results.csv", df.to_csv())
        myzip.writestr("Glossary of tags.csv", dfa.to_csv())

tab1, tab2 = st.tabs(["Summarize", "Download"])
with tab1:
    if text:
        st.dataframe(df, width=1000)
with tab2:
    st.download_button(
        label="Download zip file",
        data=buf.getvalue(),
        file_name="results.zip",
        mime="application/zip",
    )

with st.expander("Limitations and Bias"):
    st.write('''
    The Named Entity Recognition (NER) model used in this demo app was
    trained on entity-annotated news articles from a specific span of
    time, so it may not perform well on text from other domains.
    Furthermore, the model may occasionally split a single word into
    several entities.
    ''')
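
# For reference, a minimal sketch of the aggregated output that
# token_classifier(text) returns and that pd.DataFrame() tabulates above.
# The example input follows the dslim/bert-base-NER model card; the scores
# shown are illustrative and will differ slightly in practice:
#
#   token_classifier("My name is Wolfgang and I live in Berlin")
#   # [{'entity_group': 'PER', 'score': 0.998, 'word': 'Wolfgang',
#   #   'start': 11, 'end': 19},
#   #  {'entity_group': 'LOC', 'score': 0.999, 'word': 'Berlin',
#   #   'start': 34, 'end': 40}]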