Spaces:

daneshjoy
/

idr

Runtime error

App Files Files Community

idr / app.py

daneshjoy

combined doc store creation and main app

e230889 over 2 years ago

raw

history blame

6.34 kB

	import os

	import streamlit as st

	from lfqa import prepare, answer

	# %% ------------------------------------------- Creating Doc store
	# Build the FAISS document store on first launch only.
	#
	# The SQLite metadata DB and the FAISS index are written next to this file,
	# so the existence check is done against the module directory rather than
	# whatever the current working directory happens to be.
	module_dir = os.path.dirname(os.path.abspath(__file__))
	sql_file = 'faiss_doc_store.db'
	faiss_file = 'faiss_index.faiss'

	# Rebuild when either artifact is missing: the index is unusable without
	# its SQL counterpart and vice versa.
	# NOTE(review): if only one of the two files exists (interrupted build),
	# the leftover file is reused as-is below — confirm whether it should be
	# removed first.
	if not (os.path.exists(os.path.join(module_dir, sql_file))
	        and os.path.exists(os.path.join(module_dir, faiss_file))):
	    # Heavy imports are kept inside the branch so a normal start-up (store
	    # already built) does not pay for them here.
	    from haystack.document_stores import FAISSDocumentStore
	    from haystack.nodes import DensePassageRetriever
	    from haystack.utils import convert_files_to_docs, clean_wiki_text

	    # Relative paths below (data dir, sqlite URL, index file) are resolved
	    # against the module directory.
	    os.chdir(module_dir)

	    doc_dir = "data/wiki_gameofthrones_txt12"

	    # %% Download/Load Docs

	    # Get some files that we want to use
	    # s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip"
	    # fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

	    print('---> Loading Documents ...')

	    # Convert files to docs + cleaning
	    docs = convert_files_to_docs(
	        dir_path=doc_dir,
	        clean_func=clean_wiki_text,
	        split_paragraphs=True,
	    )

	    # %% Document Store

	    print('---> Creating document store ...')
	    document_store = FAISSDocumentStore(
	        embedding_dim=128,
	        faiss_index_factory_str="Flat",
	        sql_url=f"sqlite:///{sql_file}",
	    )

	    # %% Retriever (DPR)

	    print('---> Initializing retriever ...')
	    retriever = DensePassageRetriever(
	        document_store=document_store,
	        query_embedding_model="vblagoje/dpr-question_encoder-single-lfqa-wiki",
	        passage_embedding_model="vblagoje/dpr-ctx_encoder-single-lfqa-wiki",
	        use_gpu=True,
	    )

	    # %% Write documents, create embeddings and save results

	    print('---> Saving results ...')
	    # The documents must be in the store BEFORE embeddings are computed:
	    # update_embeddings() only embeds documents that are already stored, so
	    # calling it first would leave the FAISS index empty.
	    document_store.write_documents(docs)
	    document_store.update_embeddings(retriever)
	    # save faiss file
	    document_store.save(faiss_file)

	    print('Done!')


	# %% ------------------------------------------- Main App


	# UI defaults, each overridable through an environment variable of the
	# same name.

	# Initial position of the "number of documents" sidebar slider.
	DEFAULT_DOCS_FROM_RETRIEVER = int(
	    os.environ.get("DEFAULT_DOCS_FROM_RETRIEVER", "3")
	)
	# Question pre-filled in the search bar when the UI is first loaded.
	DEFAULT_QUESTION_AT_STARTUP = os.environ.get(
	    "DEFAULT_QUESTION_AT_STARTUP",
	    "Tell me something about Arya Stark?",
	)


	def set_state_if_absent(key, value):
	    """Store ``value`` under ``key`` in the Streamlit session state.

	    An entry that already exists is left untouched, so values set during
	    earlier reruns of the script are preserved.
	    """
	    if key in st.session_state:
	        return
	    st.session_state[key] = value

	def reset_results(*args):
	    """Clear the stored answer and raw results in the session state.

	    Positional arguments are accepted and ignored so the function can be
	    used directly as a widget ``on_change`` callback.
	    """
	    for field in ("answer", "results"):
	        setattr(st.session_state, field, None)

	def main(pipe):
	    """Render the long-form QA demo page and answer the entered question.

	    Args:
	        pipe: Pipeline object produced by ``prepare()``. It is passed
	            unchanged to ``answer()`` together with the question text and
	            the number of documents selected in the sidebar.

	    Returns:
	        None. All output is rendered through Streamlit; the function returns
	        early after showing an error message if the query fails.
	    """
	    # Function-scope import: only needed to record query failures.
	    import logging

	    st.set_page_config(page_title="Haystack Demo", page_icon="https://haystack.deepset.ai/img/HaystackIcon.png")

	    # Persistent state
	    set_state_if_absent("question", DEFAULT_QUESTION_AT_STARTUP)
	    set_state_if_absent("results", None)
	    set_state_if_absent("answer", None)

	    st.write("# Long-Form Question Answering")
	    st.markdown("""
	    This demo takes its data from a selection of Wikipedia pages on the topic of the Game of Thrones TV series
	    """)

	    # Sidebar
	    st.sidebar.header("Options")
	    slider_min, slider_max = 1, 10
	    top_k_retriever = st.sidebar.slider(
	        "Max. number of documents from retriever",
	        min_value=slider_min,
	        max_value=slider_max,
	        # The default comes from the environment; clamp it so an
	        # out-of-range value cannot make the slider raise.
	        value=min(max(DEFAULT_DOCS_FROM_RETRIEVER, slider_min), slider_max),
	        step=1,
	        on_change=reset_results,
	    )
	    # eval_mode = st.sidebar.checkbox("Evaluation mode")
	    # debug = st.sidebar.checkbox("Show debug info")

	    # Plain (non-f) string: CSS braces must be single, otherwise the doubled
	    # braces reach the browser verbatim and the rules are ignored.
	    st.sidebar.markdown(
	        """
	    <style>
	        a {
	            text-decoration: none;
	        }
	        .haystack-footer {
	            text-align: center;
	        }
	        .haystack-footer h4 {
	            margin: 0.1rem;
	            padding:0;
	        }
	        footer {
	            opacity: 0;
	        }
	    </style>
	    <div class="haystack-footer">
	        <hr />
	        <h4>Built with <a href="https://www.deepset.ai/haystack">Haystack</a></h4>
	        <p>Get it on <a href="https://github.com/deepset-ai/haystack/">GitHub</a>    -    Read the <a href="https://haystack.deepset.ai/overview/intro">Docs</a></p>
	        <small>Dataset link: <a href="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip">Game of Thrones Wiki</a> <br />See the <a href="https://creativecommons.org/licenses/by-sa/3.0/">License</a> (CC BY-SA 3.0).</small>
	    </div>
	    """,
	        unsafe_allow_html=True,
	    )

	    # Search bar
	    question = st.text_input(
	        value=st.session_state.question,
	        max_chars=100,
	        on_change=reset_results,
	        label="question",
	        label_visibility="hidden",
	    )
	    # Only the first column is used; the second one just keeps the button
	    # at half width.
	    col1, _ = st.columns(2)
	    col1.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)

	    # Run button
	    run_pressed = col1.button("Run")

	    # Query either on an explicit click or when the text differs from the
	    # question stored in the session state.
	    run_query = run_pressed or (question != st.session_state.question)

	    if run_query and question:
	        reset_results()
	        st.session_state.question = question

	        with st.spinner(
	            "🧠    Performing neural search on documents... \n "):
	            try:
	                st.session_state.results = answer(pipe, question, top_k_retriever)
	            # except JSONDecodeError as je:
	            #     st.error("👓    An error occurred reading the results. Is the document store working?")
	            #     return
	            except Exception as e:
	                # Top-level UI boundary: keep the page alive, but record the
	                # traceback so the failure can be diagnosed from the logs.
	                logging.exception("Query failed: %s", e)
	                if "The server is busy processing requests" in str(e) or "503" in str(e):
	                    st.error("🧑‍🌾    All our workers are busy! Try again later.")
	                else:
	                    st.error("🐞    An error occurred during the request.")
	                return

	    if st.session_state.results:
	        answers = st.session_state.results['answers']
	        if not answers:
	            # Nothing to index into; report it instead of raising IndexError.
	            st.warning("No answer was generated for this question.")
	            return

	        best = answers[0]
	        st.session_state.answer = best.answer
	        st.write(st.session_state.answer)
	        st.write('Doc IDs:')
	        st.write(best.meta['doc_ids'])
	        st.write('Doc Scores:')
	        st.write(best.meta['doc_scores'])
	        # Show at most top_k passages, but never index past what was actually
	        # returned (fewer passages than requested is possible).
	        for passage in best.meta['content'][:top_k_retriever]:
	            st.write(passage)
	            st.markdown('---\n')

	# Script entry: build the QA pipeline with lfqa.prepare() and render the UI.
	# NOTE(review): Streamlit re-executes this script on each interaction, so
	# prepare() is called on every rerun — presumably it caches its work
	# internally; confirm in lfqa.
	pipe = prepare()
	main(pipe)