Spaces:

peter2000
/

policy_test

Runtime error

App Files Files Community

policy_test / appStore /keyword_search.py

peter2000

Update appStore/keyword_search.py

4413f1a about 3 years ago

raw

history blame

5.45 kB

	# set path
	import glob, os, sys; sys.path.append('../scripts')

	#import helper
	import scripts.process as pre
	import scripts.clean as clean

	#import needed libraries
	import seaborn as sns
	from pandas import DataFrame
	import matplotlib.pyplot as plt
	import numpy as np
	import streamlit as st
	import pandas as pd
	from sklearn.feature_extraction import _stop_words
	from haystack.document_stores import InMemoryDocumentStore
	from haystack.pipelines import ExtractiveQAPipeline
	from haystack.nodes import FARMReader, TfidfRetriever
	import string
	from markdown import markdown
	from tqdm.autonotebook import tqdm
	import numpy as np
	import tempfile
	import logging
	logger = logging.getLogger(__name__)

	#Haystack Components
	# Streamlit caches the returned pipeline; SwigPyObject instances are hashed
	# as None so they do not take part in the cache key.
	@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
	def start_haystack(documents_processed):
	    """Build an extractive question-answering pipeline over the given documents.

	    The documents are written to a fresh in-memory document store, which a
	    TF-IDF retriever is attached to.  Answers are extracted by a FARM reader
	    loaded from ``deepset/roberta-base-squad2-distilled``.

	    Args:
	        documents_processed: Documents accepted by
	            ``InMemoryDocumentStore.write_documents``.

	    Returns:
	        The assembled ``ExtractiveQAPipeline``.
	    """
	    store = InMemoryDocumentStore()
	    store.write_documents(documents_processed)

	    tfidf_retriever = TfidfRetriever(document_store=store)
	    qa_reader = FARMReader(
	        model_name_or_path="deepset/roberta-base-squad2-distilled",
	        use_gpu=True,
	    )
	    return ExtractiveQAPipeline(qa_reader, tfidf_retriever)

	def ask_question(question, pipeline):
	    """Run ``pipeline`` for ``question`` and flatten its answers into dicts.

	    The retriever is asked for its 10 best hits and the reader for its 5 best
	    answers.  Every entry of ``prediction["answers"]`` is converted with
	    ``to_dict()`` and reduced to a plain dictionary:

	    * answer text present: ``context`` (wrapped in ``"..."``), ``answer``,
	      ``relevance`` (score as a percentage rounded to 2 decimals) and
	      ``offset_start_in_doc`` (start of the first document offset);
	    * answer text empty: ``context`` and ``answer`` are ``None`` and only
	      ``relevance`` carries information.

	    Returns:
	        A list with one dictionary per answer, in pipeline order.
	    """
	    run_params = {"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
	    prediction = pipeline.run(query=question, params=run_params)

	    def _flatten(raw_answer):
	        # Convert one pipeline answer object into the dict shown in the UI.
	        fields = raw_answer.to_dict()
	        if not fields["answer"]:
	            return {
	                "context": None,
	                "answer": None,
	                "relevance": round(fields["score"] * 100, 2),
	            }
	        return {
	            "context": "..." + fields["context"] + "...",
	            "answer": fields["answer"],
	            "relevance": round(fields["score"] * 100, 2),
	            "offset_start_in_doc": fields["offsets_in_document"][0]["start"],
	        }

	    return [_flatten(raw_answer) for raw_answer in prediction["answers"]]

	def app():
	    """Render the Keyword Search page.

	    Shows the title and an "About" expander, then lets the user upload a
	    PDF/DOCX/TXT file.  Once a file is uploaded it is loaded and preprocessed
	    through ``pre``, a Haystack pipeline is built with ``start_haystack`` and
	    a question box plus a "Find them." button are shown.  Pressing the button
	    runs ``ask_question`` and writes the raw results followed by each answer
	    highlighted inside its context.

	    Errors raised while searching are logged and reported on the page instead
	    of being swallowed silently.
	    """
	    with st.container():
	        st.markdown("<h1 style='text-align: center; color: black;'> Keyword Search</h1>", unsafe_allow_html=True)
	        st.write(' ')
	        st.write(' ')

	    with st.expander("ℹ️ - About this app", expanded=False):
	        st.write(
	            """
	            The Keyword Search app is an easy-to-use interface built in Streamlit for doing keyword search in policy document - developed by GIZ Data and the Sustainable Development Solution Network.
	            """
	        )
	        st.markdown("")

	    st.markdown("")
	    st.markdown("## 📌 Step One: Upload document ")

	    with st.container():
	        file = st.file_uploader('Upload PDF File', type=['pdf', 'docx', 'txt'])
	        if file is None:
	            # Without a document there is no pipeline to query yet.
	            return

	        file_name = file.name
	        with tempfile.NamedTemporaryFile(mode="wb") as temp:
	            temp.write(file.getvalue())
	            # The loader only receives the path, so the buffered bytes must
	            # reach the file before it is read.
	            temp.flush()

	            st.write("Filename: ", file_name)

	            # load document
	            documents = pre.load_document(temp.name, file_name)
	            documents_processed = pre.preprocessing(documents)
	            pipeline = start_haystack(documents_processed)

	        question = st.text_input(
	            "Please enter your question here, we will look for the answer in the document.",
	            value="floods",
	        )

	        if not st.button("Find them."):
	            return

	        with st.spinner("👑 Performing semantic search on"):
	            try:
	                logger.info("Asked %s", question)
	                results = ask_question(question, pipeline)
	                st.write('## Top Results')
	                st.write(results)
	                for result in results:
	                    _show_result(result)
	            except Exception as e:
	                # Top-level UI boundary: keep the app alive, but log the
	                # failure and tell the user.
	                logger.exception(e)
	                st.error("Something went wrong while searching the document. Please try again.")


	def _show_result(result):
	    """Write one ``ask_question`` result dictionary to the page."""
	    if not result["answer"]:
	        st.info(
	            "🤔    Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!"
	        )
	        return

	    st.write(
	        markdown(_highlight_answer(result["context"], result["answer"])),
	        unsafe_allow_html=True,
	    )
	    st.markdown(f"Relevance: {result['relevance']}")


	def _highlight_answer(context, answer, label="ANSWER", background="#964448", color="#ffffff"):
	    """Return ``context`` with the first occurrence of ``answer`` wrapped in a coloured HTML badge."""
	    import html  # local import: only this helper needs it

	    badge = (
	        f'<span style="background: {background}; color: {color}; '
	        'border-radius: 0.33rem; padding: 0.125rem 0.5rem;">'
	        f'{html.escape(answer)}'
	        '<span style="font-size: 0.67em; opacity: 0.5; padding-left: 0.5rem; '
	        f'text-transform: uppercase;">{html.escape(label)}</span>'
	        '</span>'
	    )

	    start_idx = context.find(answer)
	    if start_idx < 0:
	        # The answer text is not literally present in the context; show the
	        # context untouched and append the badge rather than slicing with -1.
	        return context + " " + badge
	    end_idx = start_idx + len(answer)
	    return context[:start_idx] + badge + context[end_idx:]