# idr / app.py
# Author: daneshjoy
# Commit e230889: "combined doc store creation and main app"
# (File-viewer page residue: raw / history blame / 6.34 kB)
import os
import streamlit as st
from lfqa import prepare, answer
# %% ------------------------------------------- Creating Doc store
# File names of the two artifacts that together persist the document store:
# the SQLite database and the FAISS index.
sql_file = 'faiss_doc_store.db'
faiss_file = 'faiss_index.faiss'

# (Re)build the document store only when at least one artifact is missing.
if not os.path.exists(sql_file) or not os.path.exists(faiss_file):
    # Imported lazily so the heavy indexing dependencies are only loaded
    # when the store actually has to be built.
    from haystack.document_stores import FAISSDocumentStore
    from haystack.nodes import DensePassageRetriever
    from haystack.utils import convert_files_to_docs, clean_wiki_text

    # Work from the directory containing this file so the relative data
    # and output paths below resolve next to it.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(module_dir)

    doc_dir = "data/wiki_gameofthrones_txt12"

    # %% Download/Load Docs
    # Get some files that we want to use
    # s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip"
    # fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
    print('---> Loading Documents ...')

    # Convert files to docs + cleaning
    docs = convert_files_to_docs(
        dir_path=doc_dir,
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )

    # %% Document Store
    print('---> Creating document store ...')
    # NOTE(review): embedding_dim=128 presumably matches the output size of
    # the DPR encoders configured below -- confirm against the model cards.
    document_store = FAISSDocumentStore(
        embedding_dim=128,
        faiss_index_factory_str="Flat",
        sql_url=f"sqlite:///{sql_file}",
    )

    # %% Retriever (DPR)
    print('---> Initializing retriever ...')
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="vblagoje/dpr-question_encoder-single-lfqa-wiki",
        passage_embedding_model="vblagoje/dpr-ctx_encoder-single-lfqa-wiki",
        use_gpu=True,
    )

    # %% Write documents, create embeddings and save results
    print('---> Saving results ...')
    # The documents have to be written first: update_embeddings() works on
    # the documents already in the store, so calling it on an empty store
    # would leave the saved FAISS index without any vectors.
    document_store.write_documents(docs)
    document_store.update_embeddings(retriever)

    # save faiss file
    document_store.save(faiss_file)
    print('Done!')
# %% ------------------------------------------- Main App
# Sliders
# Starting value of the "documents from retriever" sidebar slider; can be
# overridden through the environment variable of the same name.
DEFAULT_DOCS_FROM_RETRIEVER = int(
    os.environ.get("DEFAULT_DOCS_FROM_RETRIEVER", "3")
)

# Question shown in the search bar when the UI is first loaded; can be
# overridden through the environment variable of the same name.
DEFAULT_QUESTION_AT_STARTUP = os.environ.get(
    "DEFAULT_QUESTION_AT_STARTUP",
    "Tell me something about Arya Stark?",
)
def set_state_if_absent(key, value):
    """Store ``value`` under ``key`` in the session state unless it is set.

    An entry that already exists is left untouched.
    """
    if key in st.session_state:
        return
    st.session_state[key] = value
def reset_results(*args):
    """Clear the stored answer and results in the session state.

    Any positional arguments are accepted and ignored, so the function can
    be used directly as a widget ``on_change`` callback.
    """
    for field in ("answer", "results"):
        setattr(st.session_state, field, None)
# Static sidebar footer: CSS tweaks plus attribution and dataset links.
# This is a plain (non-f) string, so CSS braces must be single; doubled
# braces would be sent to the browser verbatim and invalidate the rules.
_SIDEBAR_FOOTER_HTML = """
<style>
a {
text-decoration: none;
}
.haystack-footer {
text-align: center;
}
.haystack-footer h4 {
margin: 0.1rem;
padding:0;
}
footer {
opacity: 0;
}
</style>
<div class="haystack-footer">
<hr />
<h4>Built with <a href="https://www.deepset.ai/haystack">Haystack</a></h4>
<p>Get it on <a href="https://github.com/deepset-ai/haystack/">GitHub</a> &nbsp;&nbsp; - &nbsp;&nbsp; Read the <a href="https://haystack.deepset.ai/overview/intro">Docs</a></p>
<small>Dataset link: <a href="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip">Game of Thrones Wiki</a> <br />See the <a href="https://creativecommons.org/licenses/by-sa/3.0/">License</a> (CC BY-SA 3.0).</small>
</div>
"""


def _render_sidebar():
    """Draw the sidebar and return the selected number of retriever docs."""
    st.sidebar.header("Options")
    top_k_retriever = st.sidebar.slider(
        "Max. number of documents from retriever",
        min_value=1,
        max_value=10,
        value=DEFAULT_DOCS_FROM_RETRIEVER,
        step=1,
        on_change=reset_results,
    )
    st.sidebar.markdown(_SIDEBAR_FOOTER_HTML, unsafe_allow_html=True)
    return top_k_retriever


def _render_results(results, top_k_retriever):
    """Show the first answer, its doc ids/scores and up to top_k passages."""
    answers = results['answers']
    if not answers:
        # Nothing to index into; report it instead of raising IndexError.
        st.warning("No answer could be generated for this question.")
        return

    top_answer = answers[0]
    st.session_state.answer = top_answer.answer
    st.write(st.session_state.answer)
    st.write('Doc IDs:')
    st.write(top_answer.meta['doc_ids'])
    st.write('Doc Scores:')
    st.write(top_answer.meta['doc_scores'])
    # Slice rather than index by range(top_k_retriever): fewer passages than
    # requested may be present, and indexing past the end would raise.
    for passage in top_answer.meta['content'][:top_k_retriever]:
        st.write(passage)
        st.markdown('---\n')


def main(pipe):
    """Render the long-form question answering page.

    Args:
        pipe: Pipeline object handed unchanged to ``answer`` together with
            the question and the number of documents to retrieve.

    The question and the latest results are kept in ``st.session_state``.
    A query is run when the "Run" button is pressed or when the text in the
    search bar differs from the stored question.
    """
    st.set_page_config(
        page_title="Haystack Demo",
        page_icon="https://haystack.deepset.ai/img/HaystackIcon.png",
    )

    # Persistent state
    set_state_if_absent("question", DEFAULT_QUESTION_AT_STARTUP)
    set_state_if_absent("results", None)

    st.write("# Long-Form Question Answering")
    st.markdown(
        "\nThis demo takes its data from a selection of Wikipedia pages on "
        "the topic of the **Game of Thrones** TV series\n"
    )

    top_k_retriever = _render_sidebar()

    # Search bar
    question = st.text_input(
        value=st.session_state.question,
        max_chars=100,
        on_change=reset_results,
        label="question",
        label_visibility="hidden",
    )
    # Only the first column is used; it holds the full-width Run button.
    col1, _ = st.columns(2)
    col1.markdown(
        "<style>.stButton button {width:100%;}</style>",
        unsafe_allow_html=True,
    )

    # Run button
    run_pressed = col1.button("Run")
    run_query = run_pressed or (question != st.session_state.question)

    if run_query and question:
        reset_results()
        st.session_state.question = question
        with st.spinner(
                "🧠 &nbsp;&nbsp; Performing neural search on documents... \n "):
            try:
                st.session_state.results = answer(pipe, question, top_k_retriever)
            except Exception as e:
                # UI boundary: turn any failure into a user-facing message
                # instead of a stack trace, then stop rendering this run.
                if "The server is busy processing requests" in str(e) or "503" in str(e):
                    st.error("🧑‍🌾 &nbsp;&nbsp; All our workers are busy! Try again later.")
                else:
                    st.error("🐞 &nbsp;&nbsp; An error occurred during the request.")
                return

    if st.session_state.results:
        _render_results(st.session_state.results, top_k_retriever)
# Script entry: obtain the pipeline object from lfqa.prepare() and render the
# page with it; main() forwards `pipe` to lfqa.answer() for each query.
pipe = prepare()
main(pipe)