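# Keyword Search page (assumed to live in the app's page/module folder):
# a Streamlit page that runs extractive question answering over an uploaded
# policy document with a Haystack TfidfRetriever + FARMReader pipeline.
# The uploaded file is expected in st.session_state['file'], set elsewhere
# in the app (e.g. a sidebar uploader).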
# set path so the helper scripts can be imported
import glob, os, sys
sys.path.append('../scripts')

# project helpers
import scripts.process as pre
import scripts.clean as clean

# third-party libraries
import seaborn as sns
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd
from sklearn.feature_extraction import _stop_words
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader, TfidfRetriever
import string
from markdown import markdown
from annotated_text import annotation
from tqdm.autonotebook import tqdm
import tempfile
import logging

logger = logging.getLogger(__name__)
# Haystack components
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack(documents_processed):
    """Build and cache the extractive QA pipeline for the processed documents."""
    # in-memory store holding the pre-processed document passages
    document_store = InMemoryDocumentStore()
    document_store.write_documents(documents_processed)
    # sparse retriever to narrow down candidate passages
    retriever = TfidfRetriever(document_store=document_store)
    # extractive reader; deepset/roberta-base-squad2 is a larger alternative model
    reader = FARMReader(model_name_or_path="deepset/tinyroberta-squad2", use_gpu=True)
    pipeline = ExtractiveQAPipeline(reader, retriever)
    return pipeline
def ask_question(question, pipeline):
    """Run the question through the pipeline and flatten the answers into plain dicts."""
    prediction = pipeline.run(query=question, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}})
    results = []
    for answer in prediction["answers"]:
        answer = answer.to_dict()
        if answer["answer"]:
            results.append(
                {
                    "context": "..." + answer["context"] + "...",
                    "answer": answer["answer"],
                    "relevance": round(answer["score"] * 100, 2),
                    "offset_start_in_doc": answer["offsets_in_document"][0]["start"],
                }
            )
        else:
            results.append(
                {
                    "context": None,
                    "answer": None,
                    "relevance": round(answer["score"] * 100, 2),
                }
            )
    return results
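# Each element returned by ask_question() has the shape built above, for example
# (illustrative values only):
#   {"context": "...surrounding text...", "answer": "extracted span",
#    "relevance": 93.1, "offset_start_in_doc": 1204}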
def app():

    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> Keyword Search</h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):
        st.write(
            """
            The *Keyword Search* app is an easy-to-use interface built in Streamlit
            for keyword search in a policy document, developed by GIZ Data and the
            Sustainable Development Solutions Network.
            """)
        st.markdown("")

    #st.markdown("")
    #st.markdown("## 📌 Step One: Upload document ")

    with st.container():
        file = st.session_state['file']
        question = st.text_input("Please enter your question here, we will look for the answer in the document.",
                                 value="What particular risks does climate change pose for women?",)

        if st.button("Find them."):
            #file = st.file_uploader('Upload PDF File', type=['pdf', 'docx', 'txt'])
            if file is not None:
                with tempfile.NamedTemporaryFile(mode="wb") as temp:
                    bytes_data = file.getvalue()
                    temp.write(bytes_data)
                    file_name = file.name
                    file_path = temp.name
                    st.write("Filename: ", file.name)

                    # load and pre-process the uploaded document
                    documents = pre.load_document(temp.name, file_name)
                    documents_processed = pre.preprocessing(documents)
                    pipeline = start_haystack(documents_processed)

                    #if st.button("Find them."):
                    with st.spinner("🕵️ Performing semantic search on the document..."):
                        try:
                            msg = 'Asked ' + question
                            logger.info(msg)
                            results = ask_question(question, pipeline)
                            st.write('## Top Results')
                            #st.write(results)
                            for count, result in enumerate(results):
                                if result["answer"]:
                                    # highlight the extracted answer span inside its context
                                    answer, context = result["answer"], result["context"]
                                    start_idx = context.find(answer)
                                    end_idx = start_idx + len(answer)
                                    st.write(
                                        markdown(context[:start_idx]
                                                 + str(annotation(body=answer, label="ANSWER", background="#964448", color='#ffffff'))
                                                 + context[end_idx:]),
                                        unsafe_allow_html=True,
                                    )
                                    st.markdown(f"**Relevance:** {result['relevance']}")
                                else:
                                    st.info(
                                        "🤔 Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!"
                                    )
                        except Exception as e:
                            logger.exception(e)
            else:
                st.info("🤔 No document found, please try to upload it at the sidebar!")
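# ---------------------------------------------------------------------------
# Usage sketch (assumption, not part of the original app): a hypothetical
# main.py could upload the file in the sidebar, store it under the
# st.session_state['file'] key this page reads, and then render the page:
#
#   import streamlit as st
#   import keyword_search  # assumed module name for this file
#
#   st.session_state['file'] = st.sidebar.file_uploader(
#       'Upload PDF File', type=['pdf', 'docx', 'txt'])
#   keyword_search.app()
# ---------------------------------------------------------------------------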