# policy_test / appStore / keyword_search.py
# Hugging Face file-viewer header (not part of the program): peter2000,
# commit "Update appStore/keyword_search.py" (4413f1a), 5.45 kB.
# set path
import glob, os, sys; sys.path.append('../scripts')
#import helper
import scripts.process as pre
import scripts.clean as clean
#import needed libraries
import seaborn as sns
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd
from sklearn.feature_extraction import _stop_words
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader, TfidfRetriever
import string
from markdown import markdown
from tqdm.autonotebook import tqdm
import numpy as np
import tempfile
import logging
logger = logging.getLogger(__name__)
#Haystack Components
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},allow_output_mutation=True)
def start_haystack(documents_processed):
    """Assemble the extractive question-answering pipeline for one document.

    The result is memoised by ``st.cache``; ``builtins.SwigPyObject`` values
    are hashed to ``None`` and mutation of the cached output is allowed.

    Args:
        documents_processed: Documents to index, in whatever form
            ``InMemoryDocumentStore.write_documents`` accepts.

    Returns:
        An ``ExtractiveQAPipeline`` combining a TF-IDF retriever over the
        in-memory store with a ``FARMReader`` running the
        ``deepset/roberta-base-squad2-distilled`` model.
    """
    # The store must be populated before the retriever is created on top of it.
    store = InMemoryDocumentStore()
    store.write_documents(documents_processed)
    tfidf_retriever = TfidfRetriever(document_store=store)
    qa_reader = FARMReader(
        model_name_or_path="deepset/roberta-base-squad2-distilled",
        use_gpu=True,
    )
    return ExtractiveQAPipeline(qa_reader, tfidf_retriever)
def ask_question(question, pipeline, top_k_retriever=10, top_k_reader=5):
    """Run *question* through *pipeline* and flatten the answers for display.

    Args:
        question: Query string handed to ``pipeline.run``.
        pipeline: Object exposing ``run(query=..., params=...)`` that returns
            a mapping with an ``"answers"`` list; each answer must provide
            ``to_dict()``.
        top_k_retriever: Number of documents requested from the retriever.
        top_k_reader: Number of answers requested from the reader.

    Returns:
        A list with one dict per answer, in pipeline order. Every dict has
        ``"context"``, ``"answer"`` and ``"relevance"`` (score as a
        percentage rounded to two decimals). Entries with a non-empty answer
        additionally carry ``"offset_start_in_doc"``, which is ``None`` when
        the pipeline reported no document offsets. Entries with an empty
        answer have ``"context"`` and ``"answer"`` set to ``None``.
    """
    prediction = pipeline.run(
        query=question,
        params={
            "Retriever": {"top_k": top_k_retriever},
            "Reader": {"top_k": top_k_reader},
        },
    )

    results = []
    for candidate in prediction["answers"]:
        fields = candidate.to_dict()
        relevance = round(fields["score"] * 100, 2)

        if not fields["answer"]:
            # The reader returned a "no answer" prediction.
            results.append({"context": None, "answer": None, "relevance": relevance})
            continue

        # Offsets may be missing or empty; do not crash on them.
        offsets = fields.get("offsets_in_document") or []
        results.append(
            {
                "context": "..." + fields["context"] + "...",
                "answer": fields["answer"],
                "relevance": relevance,
                "offset_start_in_doc": offsets[0]["start"] if offsets else None,
            }
        )
    return results
def _highlight_answer(context, answer, background="#964448", color="#ffffff"):
    """Return *context* as HTML with the first occurrence of *answer* highlighted.

    Text is HTML-escaped so document content cannot inject markup. When
    *answer* does not occur verbatim in *context*, the escaped context is
    returned without any highlight.
    """
    from html import escape  # stdlib; imported locally so the block is self-contained

    start = context.find(answer)
    if start < 0:
        return escape(context, quote=False)

    end = start + len(answer)
    badge = (
        f'<span style="background: {background}; color: {color}; '
        'padding: 0.1rem 0.4rem; border-radius: 0.3rem;">'
        f"{escape(answer, quote=False)} "
        '<span style="font-size: 0.7em; opacity: 0.8;">ANSWER</span>'
        "</span>"
    )
    return escape(context[:start], quote=False) + badge + escape(context[end:], quote=False)


def app():
    """Render the Keyword Search page.

    Flow: show the title and an "about" expander, let the user upload a
    PDF/DOCX/TXT file, load and preprocess it via ``scripts.process``, build
    the QA pipeline with ``start_haystack``, then answer the question typed
    by the user when the "Find them." button is pressed. Each answer is shown
    highlighted inside its context together with its relevance score.

    Returns:
        None. All output goes to the Streamlit page and the log.
    """
    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> Keyword Search</h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):
        st.write(
            "\n"
            "The *Keyword Search* app is an easy-to-use interface built in Streamlit "
            "for doing keyword search in policy document - developed by GIZ Data and "
            "the Sustainable Development Solution Network.\n"
        )
        st.markdown("")

    st.markdown("")
    st.markdown("## 📌 Step One: Upload document ")

    with st.container():
        file = st.file_uploader('Upload PDF File', type=['pdf', 'docx', 'txt'])
        if file is None:
            # Nothing uploaded yet: there is no document to search.
            return

        with tempfile.NamedTemporaryFile(mode="wb") as temp:
            temp.write(file.getvalue())
            # The loader is given the file by path, so buffered bytes must
            # reach the disk before it runs.
            temp.flush()
            st.write("Filename: ", file.name)

            # NOTE(review): the original file name is passed alongside the
            # temp path, presumably for file-type detection - confirm
            # against scripts.process.load_document.
            documents = pre.load_document(temp.name, file.name)
            documents_processed = pre.preprocessing(documents)
            pipeline = start_haystack(documents_processed)

        question = st.text_input(
            "Please enter your question here, we will look for the answer in the document.",
            value="floods",
        )
        if not st.button("Find them."):
            return

        with st.spinner("👑 Performing semantic search on"):
            try:
                logging.info("Asked %s", question)
                results = ask_question(question, pipeline)
                st.write('## Top Results')
                st.write(results)
                for result in results:
                    if result["answer"]:
                        highlighted = _highlight_answer(result["context"], result["answer"])
                        st.write(markdown(highlighted), unsafe_allow_html=True)
                        st.markdown(f"**Relevance:** {result['relevance']}")
                    else:
                        st.info(
                            "🤔 &nbsp;&nbsp; Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!"
                        )
            except Exception as e:
                # UI boundary: keep the page alive, record the traceback and
                # tell the user instead of failing silently.
                logging.exception(e)
                st.error("The search failed; see the application log for details.")