File size: 6,343 Bytes
c8b3fc9 e230889 c8b3fc9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
import os
import streamlit as st
from lfqa import prepare, answer
# %% ------------------------------------------- Creating Doc store
# Build the FAISS document store and its SQL side-car the first time the app
# starts. The artefacts are looked up next to this file, because that is the
# directory the build step below switches into before writing them.
# NOTE(review): lfqa.prepare() presumably loads these same two files - confirm
# which directory it resolves them against.
module_dir = os.path.dirname(os.path.abspath(__file__))
doc_dir = "data/wiki_gameofthrones_txt12"
sql_file = 'faiss_doc_store.db'
faiss_file = 'faiss_index.faiss'

store_is_built = all(
    os.path.exists(os.path.join(module_dir, artefact))
    for artefact in (sql_file, faiss_file)
)

if not store_is_built:
    # Heavy imports are deferred so that a warm start does not pay for them.
    from haystack.document_stores import FAISSDocumentStore
    from haystack.nodes import DensePassageRetriever
    from haystack.utils import convert_files_to_docs, clean_wiki_text

    # All relative paths below (documents, SQLite file, FAISS index) are
    # resolved against the directory that contains this script.
    os.chdir(module_dir)

    # %% Download/Load Docs
    # Get some files that we want to use
    # s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip"
    # fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
    print('---> Loading Documents ...')
    # Convert files to docs + cleaning
    docs = convert_files_to_docs(
        dir_path=doc_dir,
        clean_func=clean_wiki_text,
        split_paragraphs=True,
    )

    # %% Document Store
    print('---> Creating document store ...')
    document_store = FAISSDocumentStore(
        embedding_dim=128,
        faiss_index_factory_str="Flat",
        sql_url=f"sqlite:///{sql_file}",
    )

    # %% Retriever (DPR)
    print('---> Initializing retriever ...')
    retriever = DensePassageRetriever(
        document_store=document_store,
        query_embedding_model="vblagoje/dpr-question_encoder-single-lfqa-wiki",
        passage_embedding_model="vblagoje/dpr-ctx_encoder-single-lfqa-wiki",
        use_gpu=True,
    )

    # %% Create Embeddings and save results
    print('---> Saving results ...')
    # The documents must be in the store before embeddings are computed:
    # update_embeddings() only embeds documents that are already written, so
    # calling it first would leave the saved FAISS index without any vectors.
    document_store.write_documents(docs)
    document_store.update_embeddings(retriever)
    # save faiss file
    document_store.save(faiss_file)
    print('Done!')
# %% ------------------------------------------- Main App
# Sliders
# Initial position of the "documents from retriever" slider; overridable via
# the DEFAULT_DOCS_FROM_RETRIEVER environment variable (falls back to 3).
DEFAULT_DOCS_FROM_RETRIEVER = int(
    os.environ.get("DEFAULT_DOCS_FROM_RETRIEVER", "3")
)

# Question pre-filled in the search bar when the UI loads; overridable via
# the DEFAULT_QUESTION_AT_STARTUP environment variable.
DEFAULT_QUESTION_AT_STARTUP = os.environ.get(
    "DEFAULT_QUESTION_AT_STARTUP",
    "Tell me something about Arya Stark?",
)
def set_state_if_absent(key, value):
    """Store ``value`` under ``key`` in ``st.session_state`` unless the key is already present."""
    if key in st.session_state:
        # An existing entry is left untouched.
        return
    st.session_state[key] = value
def reset_results(*args):
    """Set the ``answer`` and ``results`` entries of ``st.session_state`` to ``None``.

    Accepts and ignores any positional arguments so it can be passed directly
    as a widget ``on_change`` callback.
    """
    for field in ("answer", "results"):
        setattr(st.session_state, field, None)
def main(pipe):
    """Render the Long-Form Question Answering page and run queries on demand.

    Draws the page header, the sidebar (retriever slider and footer), the
    search bar and the Run button. When Run is pressed, or the question text
    differs from the one stored in the session, ``answer`` is called with the
    pipeline, the question and the slider value. The first returned answer is
    then shown together with its document ids, scores and passages.

    Args:
        pipe: Pipeline object forwarded unchanged to ``answer``.
    """
    # Local import keeps this block self-contained; used to record failures
    # of the query instead of swallowing them silently.
    import logging

    st.set_page_config(page_title="Haystack Demo", page_icon="https://haystack.deepset.ai/img/HaystackIcon.png")

    # Persistent state
    set_state_if_absent("question", DEFAULT_QUESTION_AT_STARTUP)
    set_state_if_absent("results", None)
    set_state_if_absent("answer", None)

    st.write("# Long-Form Question Answering")
    st.markdown("""
This demo takes its data from a selection of Wikipedia pages on the topic of the **Game of Thrones** TV series
""")

    # Sidebar
    st.sidebar.header("Options")
    top_k_retriever = st.sidebar.slider(
        "Max. number of documents from retriever",
        min_value=1,
        max_value=10,
        value=DEFAULT_DOCS_FROM_RETRIEVER,
        step=1,
        on_change=reset_results,
    )

    # The string below is NOT an f-string, so CSS braces must be single;
    # doubled braces would reach the browser verbatim and invalidate the rules.
    st.sidebar.markdown(
        """
<style>
a {
text-decoration: none;
}
.haystack-footer {
text-align: center;
}
.haystack-footer h4 {
margin: 0.1rem;
padding:0;
}
footer {
opacity: 0;
}
</style>
<div class="haystack-footer">
<hr />
<h4>Built with <a href="https://www.deepset.ai/haystack">Haystack</a></h4>
<p>Get it on <a href="https://github.com/deepset-ai/haystack/">GitHub</a> - Read the <a href="https://haystack.deepset.ai/overview/intro">Docs</a></p>
<small>Dataset link: <a href="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip">Game of Thrones Wiki</a> <br />See the <a href="https://creativecommons.org/licenses/by-sa/3.0/">License</a> (CC BY-SA 3.0).</small>
</div>
""",
        unsafe_allow_html=True,
    )

    # Search bar
    question = st.text_input(
        value=st.session_state.question,
        max_chars=100,
        on_change=reset_results,
        label="question",
        label_visibility="hidden",
    )
    col1, _ = st.columns(2)
    col1.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)

    # Run button
    run_pressed = col1.button("Run")

    # A query is triggered either explicitly or by an edited question.
    run_query = run_pressed or (question != st.session_state.question)

    if run_query and question:
        reset_results()
        st.session_state.question = question
        with st.spinner("🧠 Performing neural search on documents... \n "):
            try:
                st.session_state.results = answer(pipe, question, top_k_retriever)
            except Exception as e:
                # UI boundary: keep the page alive, but leave a trace in the logs.
                logging.getLogger(__name__).exception("LFQA query failed")
                if "The server is busy processing requests" in str(e) or "503" in str(e):
                    st.error("🧑‍🌾 All our workers are busy! Try again later.")
                else:
                    st.error("🐞 An error occurred during the request.")
                return

    results = st.session_state.results
    if not results:
        return

    answers = results['answers']
    if not answers:
        # Nothing to index into; avoid an IndexError on an empty answer list.
        st.warning("No answer was generated for this question.")
        return

    top_answer = answers[0]
    st.session_state.answer = top_answer.answer
    st.write(st.session_state.answer)
    st.write('Doc IDs:')
    st.write(top_answer.meta['doc_ids'])
    st.write('Doc Scores:')
    st.write(top_answer.meta['doc_scores'])

    # Show at most top_k_retriever passages, without assuming that many came
    # back (indexing a fixed range fails when fewer passages are returned).
    for passage in top_answer.meta['content'][:top_k_retriever]:
        st.write(passage)
        st.markdown('---\n')
# Script entry: obtain the pipeline object from lfqa.prepare() and hand it to
# the UI entry point defined in this file.
# NOTE(review): Streamlit presumably re-executes this script on every widget
# interaction, so prepare() would run each time - confirm whether it caches.
pipe = prepare()
main(pipe)
|