# Streamlit demo app (Hugging Face Space) — AI Entity Extractor
import io
import re
import time
import zipfile

import streamlit as st
import pandas as pd
from streamlit_extras.stylable_container import stylable_container

import nltk
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Fetch the NLTK tokenizer models needed by word_tokenize() at runtime.
nltk.download('punkt_tab')
# Sidebar: styled demo button plus a click-to-reveal glossary of NER tags.
with st.sidebar:
    with stylable_container(
        key="test_button",
        css_styles="""
            button {
                background-color: #0000ff;
                border: none;
                color: white;
            }
            """,
    ):
        st.button("DEMO APP")

    st.subheader("Glossary of tags", divider="red")

    # (tag label, description) pairs; ticking a checkbox reveals its description.
    _GLOSSARY = [
        ("I", "Person's name"),
        ("ORG", "Organization"),
        ("LOC", "Location"),
        ("B-PER", "Beginning of a person’s name right after another person’s name"),
        ("B-ORG", "Beginning of an organisation right after another organization"),
        ("B-LOC", "Beginning of a location right after another location"),
        ("O", "Outside of a named entity"),
    ]
    for tag, description in _GLOSSARY:
        if st.checkbox(tag):
            st.write(description)
st.subheader(":blue[AI Entity Extractor]")
st.divider()


def clear_text():
    """Reset the text-input widget by blanking its session-state entry."""
    st.session_state["text"] = ""


text = st.text_input(
    "Paste your text here and then press **enter**. "
    "The length of your text should not exceed 2000 words.",
    key="text",
)
st.button("Clear text", on_click=clear_text)
st.write(text)
from nltk.tokenize import word_tokenize

# Strip punctuation before counting so the word limit reflects real words,
# not punctuation tokens.
text1 = re.sub(r'[^\w\s]', '', text)
tokens = word_tokenize(text1)
st.write("Length", len(tokens))
st.divider()

# Hard cap on input size; the model/UI are not meant for longer documents.
# Note: st.text_input returns a str (never None), so no None check is needed.
MAX_WORDS = 2000
if len(tokens) > MAX_WORDS:
    st.warning('The length of your text should not exceed 2000 words.')
    st.stop()
@st.cache_resource
def _load_token_classifier():
    """Load the multilingual NER pipeline once and reuse it across reruns."""
    return pipeline(
        model="Davlan/bert-base-multilingual-cased-ner-hrl",
        aggregation_strategy="simple",
    )


# st.text_input returns "" (never None) when empty, so test truthiness —
# the original `is not None` check was always true.
if text:
    token_classifier = _load_token_classifier()
    entities = token_classifier(text)
    df = pd.DataFrame(entities)
    # Drop stray word-piece fragments such as '##s'; guard the column lookup
    # because an empty result frame has no 'word' column.
    if 'word' in df.columns:
        df = df.drop(df[df['word'] == '##s'].index)
else:
    # Keep `df` defined so the tabs below can render without a NameError.
    df = pd.DataFrame()
# Glossary table bundled into the downloadable archive alongside the results.
# (zipfile and io are already imported at the top of the file.)
dfa = pd.DataFrame(
    data={
        'I': ['Person'],
        'ORG': ['Organization'],
        'LOC': ['Location'],
        'B-PER': ['Beginning of a person’s name right after another person’s name'],
        'B-ORG': ['Beginning of an organisation right after another organization '],
        'B-LOC': ['Beginning of a location right after another location'],
        'O': ['Outside of a named entity ']
    }
)

# Build the archive in memory; mode "w" (rather than "x") is the conventional
# choice for a fresh BytesIO buffer, which holds no pre-existing archive.
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as myzip:
    myzip.writestr("Summary of the results.csv", df.to_csv())
    myzip.writestr("Glossary of tags.csv", dfa.to_csv())
# Results UI: one tab for the entity table, one for the zip download.
tab1, tab2 = st.tabs(["Summarize", "Download"])
with tab1:
    # Show the table only when the user actually entered text
    # (the original `is not None` check was always true for text_input).
    if text:
        st.dataframe(df, width=1000)
with tab2:
    st.download_button(
        label="Download zip file",
        data=buf.getvalue(),
        file_name="zip file.zip",
        mime="application/zip",
    )