Spaces:
Runtime error
Runtime error
File size: 5,232 Bytes
dd124ec 55d03cf dd124ec 55d03cf dd124ec f75d001 dd124ec f75d001 dd124ec f75d001 dd124ec f75d001 dd124ec f75d001 dd124ec f75d001 dd124ec f75d001 dd124ec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
# set path
import glob, os, sys; sys.path.append('../scripts')
#import helper
import scripts.process as pre
import scripts.clean as clean
#import needed libraries
import seaborn as sns
from pandas import DataFrame
from sentence_transformers import SentenceTransformer, CrossEncoder, util
# from keybert import KeyBERT
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import numpy as np
import tempfile
import sqlite3
#Haystack Components
# SwigPyObject instances are given a constant hash (None) so Streamlit's cache
# does not try to hash them; the cached pipeline may be mutated by callers.
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack(temp_name, file):
    """Build an extractive question-answering pipeline over one uploaded document.

    The document is loaded and preprocessed through the project's
    ``scripts.process`` helpers, written to a fresh in-memory document store,
    and wrapped in an ``ExtractiveQAPipeline`` made of a TF-IDF retriever and
    a FARM reader using the ``deepset/roberta-base-squad2-distilled`` model.

    Args:
        temp_name: Path of the temporary file holding the uploaded bytes.
            (The original signature spelled this parameter ``temp.name``,
            which is not a valid identifier and made the module fail to
            parse; positional callers are unaffected by the rename.)
        file: The uploaded file object, forwarded unchanged to
            ``pre.load_document``.

    Returns:
        The assembled ``ExtractiveQAPipeline``.
    """
    # NOTE(review): InMemoryDocumentStore, TfidfRetriever, FARMReader and
    # ExtractiveQAPipeline are not imported anywhere in the visible file.
    # They look like Haystack classes; add the import matching the installed
    # Haystack version, otherwise this function raises NameError when called.
    document_store = InMemoryDocumentStore()

    documents = pre.load_document(temp_name, file)
    documents_processed = pre.preprocessing(documents)
    document_store.write_documents(documents_processed)

    retriever = TfidfRetriever(document_store=document_store)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2-distilled", use_gpu=True)

    # Returned directly instead of through a local called ``pipeline``, which
    # shadowed the ``pipeline`` factory imported from ``transformers``.
    return ExtractiveQAPipeline(reader, retriever)
def ask_question(question, pipeline=None):
    """Run *question* through a QA pipeline and flatten the answers for display.

    Args:
        question: Query string passed to ``pipeline.run`` as ``query``.
        pipeline: Object exposing ``run(query=..., params=...)`` that returns
            a mapping with an ``"answers"`` list whose items provide
            ``to_dict()``. When omitted, the module-level name ``pipeline``
            is used, which is what the original one-argument form did. In
            this file that global is the ``transformers`` factory function,
            not a QA pipeline, so callers should pass the pipeline explicitly.

    Returns:
        A list with one dict per answer. Answers with non-empty text carry
        ``context`` (wrapped in ``"..."``), ``answer``, ``relevance`` (score
        as a percentage rounded to two decimals) and ``offset_start_in_doc``;
        empty answers carry ``context`` and ``answer`` set to ``None`` plus
        ``relevance``.
    """
    if pipeline is None:
        # Backward-compatible fallback for callers that still pass one argument.
        pipeline = globals().get("pipeline")

    prediction = pipeline.run(
        query=question,
        params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
    )

    results = []
    for answer in prediction["answers"]:
        answer = answer.to_dict()
        relevance = round(answer["score"] * 100, 2)
        if answer["answer"]:
            results.append(
                {
                    "context": "..." + answer["context"] + "...",
                    "answer": answer["answer"],
                    "relevance": relevance,
                    "offset_start_in_doc": answer["offsets_in_document"][0]["start"],
                }
            )
        else:
            results.append(
                {
                    "context": None,
                    "answer": None,
                    "relevance": relevance,
                }
            )
    return results


def _highlight_answer(context, answer):
    """Return *context* as HTML with the first occurrence of *answer* highlighted."""
    import html

    start_idx = context.find(answer)
    if start_idx < 0:
        # Answer text not found verbatim: show the context without a highlight
        # rather than slicing at a bogus -1 offset.
        return html.escape(context)

    end_idx = start_idx + len(answer)
    # The text comes from an uploaded document and is rendered with
    # unsafe_allow_html=True, so every piece is escaped before insertion.
    badge = (
        "<span style='background: #964448; color: #ffffff; "
        "padding: 0.1em 0.3em; border-radius: 0.3em;'>"
        f"{html.escape(answer)} "
        "<span style='font-size: 0.7em; opacity: 0.7;'>ANSWER</span>"
        "</span>"
    )
    return html.escape(context[:start_idx]) + badge + html.escape(context[end_idx:])


def app():
    """Render the Keyword Search page.

    Shows the title and the "about" panel, lets the user upload a document,
    builds a QA pipeline for it with ``start_haystack``, and, once the button
    is pressed, displays the answers returned by ``ask_question`` with the
    answer text highlighted inside its context.
    """
    import logging

    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> Keyword Search</h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=True):
        st.write(
            """
            The *Keyword Search* app is an easy-to-use interface built in Streamlit for doing keyword search in policy document - developed by GIZ Data and the Sustainable Development Solution Network.
            """
        )
        st.markdown("")

    st.markdown("")
    st.markdown("## 📌 Step One: Upload document ")

    with st.container():
        file = st.file_uploader('Upload PDF File', type=['pdf', 'docx', 'txt'])
        if file is None:
            return

        with tempfile.NamedTemporaryFile(mode="wb") as temp:
            temp.write(file.getvalue())
            # Flush so the bytes are on disk before the file is re-opened by
            # name inside start_haystack.
            temp.flush()
            st.write("Filename: ", file.name)
            # Load and index the document.
            pipeline = start_haystack(temp.name, file)

        question = st.text_input(
            "Please enter your question here, we will look for the answer in the document.",
            value="floods",
        )

        if not st.button("Find them."):
            return

        # Reset first so a failed query never shows the previous question's
        # answers, and so the attribute always exists when read below.
        st.session_state.results = None
        with st.spinner("🔎 Performing semantic search on"):
            try:
                logging.info("Asked %s", question)
                # Pass the pipeline explicitly: ask_question cannot see this
                # function's local variable.
                st.session_state.results = ask_question(question, pipeline)
            except Exception as e:
                logging.exception(e)
                st.error("The search failed. Please check the logs for details.")

        if st.session_state.results:
            st.write('## Top Results')
            for result in st.session_state.results:
                if result["answer"]:
                    st.write(
                        _highlight_answer(result["context"], result["answer"]),
                        unsafe_allow_html=True,
                    )
                    st.markdown(f"**Relevance:** {result['relevance']}")
                else:
                    st.info(
                        "🤔 Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!"
                    )
|