# set path
import glob, os, sys; sys.path.append('../scripts')

#import helper
import scripts.process as pre
import scripts.clean as clean

#import needed libraries
import seaborn as sns
from pandas import DataFrame
from sentence_transformers import SentenceTransformer, CrossEncoder, util
# from keybert import KeyBERT
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm

import logging
import tempfile
import sqlite3

# Haystack components used below (Haystack 1.x module layout assumed)
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever, FARMReader
from haystack.pipelines import ExtractiveQAPipeline

# used to highlight answers inside their context in the results view
from markdown import markdown
from annotated_text import annotation
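
# Basic logging setup so the logging.info / logging.exception calls below are
# visible on the console (assumption: no central logging config elsewhere).
logging.basicConfig(level=logging.INFO)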

#Haystack Components
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None}, allow_output_mutation=True)
def start_haystack(temp_name, file):
    """Build and cache the extractive QA pipeline for the uploaded document."""
    document_store = InMemoryDocumentStore()
    documents = pre.load_document(temp_name, file)
    documents_processed = pre.preprocessing(documents)
    document_store.write_documents(documents_processed)
    # sparse retriever narrows the search space; the reader extracts exact answer spans
    retriever = TfidfRetriever(document_store=document_store)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2-distilled", use_gpu=True)
    qa_pipeline = ExtractiveQAPipeline(reader, retriever)
    return qa_pipeline
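
# Minimal usage sketch, assuming a document already saved to disk (the path and
# query below are illustrative only):
#   qa = start_haystack("/tmp/example.pdf", uploaded_file)
#   pred = qa.run(query="flood protection measures",
#                 params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}})
#   print(pred["answers"][0].answer)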


def ask_question(question, qa_pipeline):
    """Query the pipeline and return the top answers with context and relevance."""
    prediction = qa_pipeline.run(query=question, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}})
    results = []
    for answer in prediction["answers"]:
        answer = answer.to_dict()
        if answer["answer"]:
            results.append(
                {
                    "context": "..." + answer["context"] + "...",
                    "answer": answer["answer"],
                    "relevance": round(answer["score"] * 100, 2),
                    "offset_start_in_doc": answer["offsets_in_document"][0]["start"],
                }
            )
        else:
            results.append(
                {
                    "context": None,
                    "answer": None,
                    "relevance": round(answer["score"] * 100, 2),
                }
            )
    return results
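
# Shape of the returned list when the reader finds an answer (field values here
# are illustrative, not real output):
#   [{"context": "...annual flood damages...", "answer": "flood damages",
#     "relevance": 87.5, "offset_start_in_doc": 1042}, ...]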

def app():

    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> Keyword Search</h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=True):

        st.write(
            """
            The *Keyword Search* app is an easy-to-use interface built in Streamlit for keyword search in policy documents - developed by GIZ Data and the Sustainable Development Solutions Network.
            """
        )

        st.markdown("")

    st.markdown("")
    st.markdown("##  ๐Ÿ“Œ Step One: Upload document ")
    
    with st.container():

        file = st.file_uploader('Upload PDF, DOCX, or TXT file', type=['pdf', 'docx', 'txt'])
        
        if file is not None:
            
    
            with tempfile.NamedTemporaryFile(mode="wb") as temp:
                bytes_data = file.getvalue()
                temp.write(bytes_data)
            
                st.write("Filename: ", file.name)
                
                # build (or fetch from cache) the QA pipeline for this document;
                # loading and preprocessing happen inside start_haystack
                qa_pipeline = start_haystack(temp.name, file)
                

                
                question = st.text_input("Please enter your question here, we will look for the answer in the document.",
                                         value="floods")

                if st.button("Find them."):
                    with st.spinner("👑 Performing semantic search on " + file.name + "..."):
                        try:
                            logging.info('Asked ' + question)
                            st.session_state.results = ask_question(question, qa_pipeline)
                        except Exception as e:
                            logging.exception(e)
                
                
                if st.session_state.get("results"):
                    st.write('## Top Results')
                    for count, result in enumerate(st.session_state.results):
                        if result["answer"]:
                            answer, context = result["answer"], result["context"]
                            start_idx = context.find(answer)
                            end_idx = start_idx + len(answer)
                            st.write(
                                markdown(context[:start_idx] + str(annotation(body=answer, label="ANSWER", background="#964448", color='#ffffff')) + context[end_idx:]),
                                unsafe_allow_html=True,
                            )
                            st.markdown(f"**Relevance:** {result['relevance']}")
                        else:
                            st.info(
                                "๐Ÿค” &nbsp;&nbsp; Haystack is unsure whether any of the documents contain an answer to your question. Try to reformulate it!"
                            )
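
# Hypothetical entry point so this page can also be run standalone with
# `streamlit run`; in a multipage setup the framework imports and calls app().
if __name__ == '__main__':
    app()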