import io
import re
import zipfile

import nltk
import pandas as pd
import streamlit as st
from nltk.tokenize import word_tokenize
from streamlit_extras.stylable_container import stylable_container
from transformers import pipeline

# Download the tokenizer data that word_tokenize needs
# (recent NLTK versions use the "punkt_tab" resource).
nltk.download('punkt_tab')

with st.sidebar:
    with stylable_container(
        key="test_button",
        css_styles="""
            button {
                background-color: #0000ff;
                border: none;
                color: white;
            }
            """,
    ):
        st.button("DEMO APP")

    st.subheader("Glossary of tags", divider="red")
    if st.checkbox("PER"):
        st.write("Person's name")
    if st.checkbox("ORG"):
        st.write("Organization")
    if st.checkbox("LOC"):
        st.write("Location")
    if st.checkbox("MISC"):
        st.write("Miscellaneous")

    with st.expander("entity_group"):
        st.write("This is the tag that has been assigned to an entity.")
    with st.expander("score"):
        st.write("This indicates the confidence level with which a tag has been assigned to an entity.")
    with st.expander("word"):
        st.write("This is the entity that has been extracted from your text data.")
    with st.expander("start-end"):
        st.write("This indicates the position of the entity in your text data.")
    with st.expander("Limitations and Bias"):
        st.write(
            "The Named Entity Recognition (NER) model used in this demo app is "
            "limited by its training dataset of entity-annotated news articles "
            "from a specific span of time. This means that it might not perform "
            "well for all use cases in different domains. Furthermore, the model "
            "may occasionally split words into different parts."
        )

st.subheader(":blue[AI Entity Extractor]")
st.write("made by [nlpblogs](https://nlpblogs.com/)")
st.write("Apache 2.0")
st.divider()


def clear_text():
    st.session_state["text"] = ""


text = st.text_area(
    "Paste your text here and then press **Ctrl + Enter**. "
    "The length of your text should not exceed 1000 words.",
    key="text",
)
st.button("Clear text", on_click=clear_text)
st.write(text)

# Count words on a punctuation-stripped copy so the 1000-word limit
# is enforced on words rather than on punctuation tokens.
text1 = re.sub(r'[^\w\s]', '', text)
tokens = word_tokenize(text1)
st.write("Length:", len(tokens))
st.divider()

MAX_WORDS = 1000
if len(tokens) > MAX_WORDS:
    st.warning('The length of your text should not exceed 1000 words.')
    st.stop()

# st.text_area returns an empty string (not None) when there is no input,
# so guard on truthiness before running the model.
if text:
    with st.spinner('Wait for it...'):
        token_classifier = pipeline(
            model="dslim/bert-base-NER",
            aggregation_strategy="simple",
        )
        entities = token_classifier(text)
    df = pd.DataFrame(entities)

    # Glossary of tags, shipped alongside the results.
    dfa = pd.DataFrame(
        data={
            'PER': ['Person'],
            'ORG': ['Organization'],
            'LOC': ['Location'],
            'MISC': ['Miscellaneous'],
        }
    )

    # Bundle both CSVs into a single zip archive held in memory.
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as myzip:
        myzip.writestr("Summary of the results.csv", df.to_csv())
        myzip.writestr("Glossary of tags.csv", dfa.to_csv())

    tab1, tab2 = st.tabs(["Summarize", "Download"])
    with tab1:
        st.dataframe(df, width=1000)
    with tab2:
        st.download_button(
            label="Download zip file",
            data=buf.getvalue(),
            file_name="results.zip",
            mime="application/zip",
        )

with st.expander("Limitations and Bias"):
    st.write(
        "The Named Entity Recognition (NER) model used in this demo app is "
        "limited by its training dataset of entity-annotated news articles "
        "from a specific span of time. This means that it might not perform "
        "well for all use cases in different domains. Furthermore, the model "
        "may occasionally split words into different parts."
    )
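
# --- Optional refinement: a minimal sketch, not part of the original app ---
# Streamlit reruns this whole script on every widget interaction, so the
# Hugging Face model above is reloaded each time. One common fix is to cache
# the pipeline with st.cache_resource (available in Streamlit >= 1.18). The
# helper name below is hypothetical; to adopt it, replace the inline
# pipeline(...) call above with load_token_classifier().
@st.cache_resource
def load_token_classifier():
    # Load the NER pipeline once and reuse the same object across reruns.
    return pipeline(model="dslim/bert-base-NER", aggregation_strategy="simple")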