# Streamlit demo app (Hugging Face Space) — AI Entity Extractor
import io
import re
import time
import zipfile

import streamlit as st
import pandas as pd
from streamlit_extras.stylable_container import stylable_container

import nltk
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Fetch the NLTK tokenizer models needed by word_tokenize() at runtime.
nltk.download('punkt_tab')
# Sidebar: styled demo button plus a click-to-reveal glossary of NER tags.
with st.sidebar:
    with stylable_container(
        key="test_button",
        css_styles="""
            button {
                background-color: #0000ff;
                border: none;
                color: white;
            }
            """,
    ):
        st.button("DEMO APP")

    st.subheader("Glossary of tags", divider="red")

    # (tag label, description) pairs; ticking a checkbox reveals its description.
    _GLOSSARY = [
        ("I", "Person's name"),
        ("ORG", "Organization"),
        ("LOC", "Location"),
        ("B-PER", "Beginning of a person’s name right after another person’s name"),
        ("B-ORG", "Beginning of an organisation right after another organization"),
        ("B-LOC", "Beginning of a location right after another location"),
        ("O", "Outside of a named entity"),
    ]
    for tag, description in _GLOSSARY:
        if st.checkbox(tag):
            st.write(description)
st.subheader(":blue[AI Entity Extractor]")
st.divider()


def clear_text():
    """Reset the text-input widget by blanking its session-state entry."""
    st.session_state["text"] = ""


text = st.text_input(
    "Paste your text here and then press **enter**. "
    "The length of your text should not exceed 2000 words.",
    key="text",
)
st.button("Clear text", on_click=clear_text)
st.write(text)
from nltk.tokenize import word_tokenize

# Strip punctuation before counting so the word limit reflects real words,
# not punctuation tokens.
text1 = re.sub(r'[^\w\s]', '', text)
tokens = word_tokenize(text1)
st.write("Length", len(tokens))
st.divider()

# Hard cap on input size; the model/UI are not meant for longer documents.
# Note: st.text_input returns a str (never None), so no None check is needed.
MAX_WORDS = 2000
if len(tokens) > MAX_WORDS:
    st.warning('The length of your text should not exceed 2000 words.')
    st.stop()
@st.cache_resource
def _load_token_classifier():
    """Load the multilingual NER pipeline once and reuse it across reruns."""
    return pipeline(
        model="Davlan/bert-base-multilingual-cased-ner-hrl",
        aggregation_strategy="simple",
    )


# st.text_input returns "" (never None) when empty, so test truthiness —
# the original `is not None` check was always true.
if text:
    token_classifier = _load_token_classifier()
    entities = token_classifier(text)
    df = pd.DataFrame(entities)
    # Drop stray word-piece fragments such as '##s'; guard the column lookup
    # because an empty result frame has no 'word' column.
    if 'word' in df.columns:
        df = df.drop(df[df['word'] == '##s'].index)
else:
    # Keep `df` defined so the tabs below can render without a NameError.
    df = pd.DataFrame()
# Glossary table bundled into the downloadable archive alongside the results.
# (zipfile and io are already imported at the top of the file.)
dfa = pd.DataFrame(
    data={
        'I': ['Person'],
        'ORG': ['Organization'],
        'LOC': ['Location'],
        'B-PER': ['Beginning of a person’s name right after another person’s name'],
        'B-ORG': ['Beginning of an organisation right after another organization '],
        'B-LOC': ['Beginning of a location right after another location'],
        'O': ['Outside of a named entity ']
    }
)

# Build the archive in memory; mode "w" (rather than "x") is the conventional
# choice for a fresh BytesIO buffer, which holds no pre-existing archive.
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as myzip:
    myzip.writestr("Summary of the results.csv", df.to_csv())
    myzip.writestr("Glossary of tags.csv", dfa.to_csv())
# Results UI: one tab for the entity table, one for the zip download.
tab1, tab2 = st.tabs(["Summarize", "Download"])
with tab1:
    # Show the table only when the user actually entered text
    # (the original `is not None` check was always true for text_input).
    if text:
        st.dataframe(df, width=1000)
with tab2:
    st.download_button(
        label="Download zip file",
        data=buf.getvalue(),
        file_name="zip file.zip",
        mime="application/zip",
    )