# NOTE: the lines below are Hugging Face web-page chrome captured when this
# file was scraped; they are not Python code, so they are preserved here as
# comments to keep the script runnable.
# nlpblogs's picture
# Update app.py
# 8cae1d5 verified
# raw
# history blame
# 3.79 kB
# Dependencies: Streamlit for the UI, pandas for tabular results, NLTK for
# word counting, and Hugging Face transformers for the NER pipeline.
import streamlit as st
import pandas as pd
from streamlit_extras.stylable_container import stylable_container
import time
import zipfile
import io
import nltk
# Fetch the Punkt tokenizer data required by nltk.tokenize.word_tokenize.
nltk.download('punkt_tab')
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import re
with st.sidebar:
    # "DEMO APP" badge: a button recolored blue via injected CSS.
    with stylable_container(
        key="test_button",
        css_styles="""
        button {
            background-color: #0000ff;
            border: none;
            color: white;
        }
        """,
    ):
        st.button("DEMO APP")

    st.subheader("Glossary", divider="red")

    # Glossary entries as (expander title, body text) pairs, rendered in order.
    _glossary = (
        ("PER", '''
Person's name
'''),
        ("ORG", '''
Organization name
'''),
        ("LOC", '''
Location name
'''),
        ("MISC", '''
Miscellaneous
'''),
        ("entity_group", '''
This is the tag that has been assigned to an entity.
'''),
        ("score", '''
This indicates the confidence level that a tag has been assigned to an entity.
'''),
        ("word", '''
This is the entity that has been extracted from your text data.
'''),
        ("start", '''
This is the index of the first character of the entity in your text data.
'''),
        ("end", '''
This is the index of the character immediately after the last character of the entity.
'''),
    )
    for _title, _body in _glossary:
        with st.expander(_title):
            st.write(_body)
# Main-page header: app title, attribution, license, then a horizontal rule.
st.subheader(":blue[AI Entity Extractor]")
for _line in ("made by [nlpblogs](https://nlpblogs.com/)", "Apache 2.0"):
    st.write(_line)
st.divider()
def clear_text():
    """Blank the text area by resetting its session-state entry (key: "text")."""
    st.session_state.text = ""
# Text input plus a word-count guard; processing stops beyond 1000 words.
text = st.text_area("Paste your text here and then press **Ctrl + Enter**. The length of your text should not exceed 1000 words.", key="text")
st.button("Clear text", on_click=clear_text)
st.write(text)

from nltk.tokenize import word_tokenize

# Strip punctuation before counting so only word-like tokens are tallied.
cleaned = re.sub(r'[^\w\s]', '', text)
tokens = word_tokenize(cleaned)
st.write("Length", len(tokens))
st.divider()

word_limit = 1000
if text is not None and len(tokens) > word_limit:
    st.warning('The length of your text should not exceed 1000 words.')
    st.stop()
# Brief artificial delay so the spinner is visible before the NER work starts.
# The redundant mid-script `import time` was removed: `time` is already
# imported at the top of the file.
with st.spinner('Wait for it...'):
    time.sleep(5)
# Run the Hugging Face NER pipeline over the raw text and tabulate entities.
# NOTE(review): `text` comes from st.text_area, which appears to default to
# an empty string rather than None, so this guard is likely always true —
# kept as-is to preserve original behavior.
if text is not None:
    token_classifier = pipeline(model="dslim/bert-base-NER", aggregation_strategy="simple")
    entities = token_classifier(text)
    df = pd.DataFrame(entities)
# Static glossary table bundled into the zip download alongside the results.
# The redundant mid-script `import zipfile` / `import io` were removed:
# both modules are already imported at the top of the file.
dfa = pd.DataFrame(
    data={
        'PER': ['Person'],
        'ORG': ['Organization'],
        'LOC': ['Location'],
        'MISC': ['Miscellaneous'],
    }
)
# Assemble an in-memory zip holding the NER results and the tag glossary.
# Mode "x" creates a fresh archive in the empty BytesIO buffer.
buf = io.BytesIO()
with zipfile.ZipFile(buf, "x") as archive:
    if text is not None:
        archive.writestr("Summary of the results.csv", df.to_csv())
        archive.writestr("Glossary of tags.csv", dfa.to_csv())
# Two-tab results area: an on-screen table and a zip-file download.
summary_tab, download_tab = st.tabs(["Summarize", "Download"])
with summary_tab:
    if text is not None:
        st.dataframe(df, width=1000)
with download_tab:
    st.download_button(
        label="Download zip file",
        data=buf.getvalue(),
        file_name="zip file.zip",
        mime="application/zip",
    )
# Disclaimer about the NER model's training-data limitations.
# Fixed user-facing typos: "occassionally" -> "occasionally" and the
# ungrammatical "perform excellent" -> "perform well".
with st.expander("Limitations and Bias"):
    st.write('''
The Named Entity Recognition (NER) model used in this demo app is limited by its training dataset of entity-annotated news articles from a specific span of time. This means that it might not perform well for all use cases in different domains. Furthermore, the model may occasionally split words into different parts.
''')