Delete utils

- utils/__init__.py +0 -1
- utils/checkconfig.py +0 -15
- utils/keyword_extraction.py +0 -140
- utils/lexical_search.py +0 -251
- utils/ndc_explorer.py +0 -90
- utils/preprocessing.py +0 -260
- utils/sdg_classifier.py +0 -177
- utils/semantic_search.py +0 -582
- utils/streamlitcheck.py +0 -42
- utils/uploadAndExample.py +0 -33
utils/__init__.py
DELETED
@@ -1 +0,0 @@
-# adding for package implementation
utils/checkconfig.py
DELETED
@@ -1,15 +0,0 @@
-import configparser
-import logging
-
-def getconfig(configfile_path:str):
-    """
-    configfile_path: file path of .cfg file
-    """
-
-    config = configparser.ConfigParser()
-
-    try:
-        config.read_file(open(configfile_path))
-        return config
-    except:
-        logging.warning("config file not found")
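Note: for reference, a minimal usage sketch of the deleted helper (the config file name is illustrative; the [sdg] section and MODEL key mirror how sdg_classifier.py below reads the config):

from utils.checkconfig import getconfig

# Returns a configparser.ConfigParser, or None (with a warning) if the file is missing.
config = getconfig("paramconfig.cfg")  # file name assumed for illustration
if config is not None:
    model_name = config.get("sdg", "MODEL")  # section/key as used in sdg_classifier.py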
utils/keyword_extraction.py
DELETED
@@ -1,140 +0,0 @@
-import pandas as pd
-# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
-# import nltk
-# nltk.download('stopwords')
-# from nltk.corpus import stopwords
-import pickle
-from typing import List, Text
-import logging
-from summa import keywords
-
-try:
-    import streamlit as st
-except ImportError:
-    logging.info("Streamlit not installed")
-
-
-def sort_coo(coo_matrix):
-    """
-    Takes a COOrdinate-format scipy sparse matrix and extracts info from it.
-    1. https://kavita-ganesan.com/python-keyword-extraction/#.Y2-TFHbMJPb
-    """
-    tuples = zip(coo_matrix.col, coo_matrix.data)
-    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
-
-def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
-    """get the feature names and tf-idf score of top n items
-
-    Params
-    ---------
-    feature_names: list of words from vectorizer
-    sorted_items: tuples returned by the sort_coo function defined in
-        keyword_extraction.py
-    top_n: top n words to be extracted using tfidf
-
-    Return
-    ----------
-    results: top extracted keywords
-
-    """
-
-    # use only top_n items from vector
-    sorted_items = sorted_items[:top_n]
-    score_vals = []
-    feature_vals = []
-
-    # word index and corresponding tf-idf score
-    for idx, score in sorted_items:
-
-        # keep track of feature name and its corresponding score
-        score_vals.append(round(score, 3))
-        feature_vals.append(feature_names[idx])
-
-    results = {}
-    for idx in range(len(feature_vals)):
-        results[feature_vals[idx]] = score_vals[idx]
-
-    return results
-
-
-def tfidf_keyword(textdata:str, vectorizer, tfidfmodel, top_n):
-    """
-    TFIDF based keyword extraction
-
-    Params
-    ---------
-    vectorizer: trained count vectorizer model
-    tfidfmodel: TFIDF Transformer model
-    top_n: top N keywords to be extracted
-    textdata: text data on which keyword extraction is performed
-
-    Return
-    ----------
-    keywords: top extracted keywords
-
-    """
-    features = vectorizer.get_feature_names_out()
-    tf_idf_vector = tfidfmodel.transform(vectorizer.transform(textdata))
-    sorted_items = sort_coo(tf_idf_vector.tocoo())
-    results = extract_topn_from_vector(features, sorted_items, top_n)
-    keywords = [keyword for keyword in results]
-    return keywords
-
-def keyword_extraction(sdg:int, sdgdata:List[Text], top_n:int = 10):
-    """
-    TFIDF based keyword extraction
-
-    Params
-    ---------
-    sdg: which sdg tfidf model to be used
-    sdgdata: text data on which keyword extraction is performed
-
-    Return
-    ----------
-    keywords: top extracted keywords
-
-    """
-    model_path = "docStore/sdg{}/".format(sdg)
-    vectorizer = pickle.load(open(model_path + 'vectorizer.pkl', 'rb'))
-    tfidfmodel = pickle.load(open(model_path + 'tfidfmodel.pkl', 'rb'))
-    features = vectorizer.get_feature_names_out()
-    tf_idf_vector = tfidfmodel.transform(vectorizer.transform(sdgdata))
-    sorted_items = sort_coo(tf_idf_vector.tocoo())
-    results = extract_topn_from_vector(features, sorted_items, top_n)
-    keywords = [keyword for keyword in results]
-    return keywords
-
-@st.cache(allow_output_mutation=True)
-def textrank(textdata:Text, ratio:float = 0.1, words:int = 0) -> List[str]:
-    """
-    Wrapper function to perform textrank; uses either ratio or word count to
-    limit the number of extracted keywords.
-    1. https://github.com/summanlp/textrank/blob/master/summa/keywords.py
-
-    Params
-    --------
-    textdata: text data to perform the textrank on.
-    ratio: float to limit the number of keywords as a proportion of total tokens
-        in textdata
-    words: number of keywords to be extracted. Takes priority over ratio if
-        non-zero. However, in case pagerank returns fewer keywords than the
-        fixed value, then ratio is used.
-
-    Return
-    --------
-    results: extracted keywords
-    """
-    if words == 0:
-        logging.info("Textrank using default ratio value = 0.1, as no words limit given")
-        results = keywords.keywords(textdata, ratio=ratio).split("\n")
-    else:
-        try:
-            results = keywords.keywords(textdata, words=words).split("\n")
-        except:
-            results = keywords.keywords(textdata, ratio=ratio).split("\n")
-
-    return results
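Note: a minimal usage sketch of the deleted module (the sample paragraph is made up; keyword_extraction expects the pickled TFIDF artefacts under docStore/sdg<N>/, and textrank is cached for a Streamlit session):

from utils.keyword_extraction import keyword_extraction, textrank

paragraphs = ["Access to clean water and sanitation remains a challenge in rural districts."]

# TFIDF keywords, using the pickled vectorizer/tfidfmodel stored under docStore/sdg6/
tfidf_keys = keyword_extraction(sdg=6, sdgdata=paragraphs, top_n=5)

# Graph-based (summa textrank) keywords from a single text string
textrank_keys = textrank(textdata=paragraphs[0], words=5)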
utils/lexical_search.py
DELETED
@@ -1,251 +0,0 @@
-from haystack.nodes import TfidfRetriever
-from haystack.document_stores import InMemoryDocumentStore
-import spacy
-import re
-from spacy.matcher import Matcher
-from markdown import markdown
-from annotated_text import annotation
-from haystack.schema import Document
-from typing import List, Text, Tuple
-from typing_extensions import Literal
-from utils.preprocessing import processingpipeline
-from utils.streamlitcheck import check_streamlit
-import logging
-try:
-    from termcolor import colored
-except:
-    pass
-
-try:
-    import streamlit as st
-except ImportError:
-    logging.info("Streamlit not installed")
-
-
-def runLexicalPreprocessingPipeline(file_name:str, file_path:str,
-                    split_by: Literal["sentence", "word"] = 'word',
-                    split_length:int = 80, split_overlap:int = 0,
-                    remove_punc:bool = False) -> List[Document]:
-    """
-    Creates and runs the preprocessing pipeline; the params for the pipeline
-    are fetched from paramconfig. As lexical search is not affected by overlap,
-    split_overlap = 0 and split_by = 'word' in the default paramconfig.
-
-    Params
-    ------------
-    file_name: filename, in case of streamlit application use
-        st.session_state['filename']
-    file_path: filepath, in case of streamlit application use
-        st.session_state['filepath']
-    split_by: document splitting strategy, either word or sentence
-    split_length: when synthetically creating the paragraphs from the document,
-        it defines the length of a paragraph.
-    split_overlap: number of words or sentences that overlap when creating
-        the paragraphs. This is done as one sentence or 'some words' make sense
-        when read together with others; therefore the overlap is used.
-    remove_punc: whether to remove all punctuation, including ',' and '.'
-
-    Return
-    --------------
-    List[Document]: when the preprocessing pipeline is run, the output dictionary
-        has four objects. For the lexical search using TfidfRetriever we
-        need the list of Haystack Documents, which can be fetched by
-        key = 'documents' on the output.
-    """
-
-    lexical_processing_pipeline = processingpipeline()
-
-    output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
-                            params= {"FileConverter": {"file_path": file_path, \
-                                        "file_name": file_name},
-                                     "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                            "split_by": split_by, \
-                                            "split_length":split_length,\
-                                            "split_overlap": split_overlap}})
-
-    return output_lexical_pre
-
-
-def tokenize_lexical_query(query:str) -> List[str]:
-    """
-    Removes the stop words from the query and returns the list of important
-    keywords in it. For the lexical search, the relevant paragraphs in the
-    document are retrieved using TfidfRetriever from Haystack. However, to
-    highlight these keywords we need the tokenized form of the query.
-
-    Params
-    --------
-    query: string which represents either a list of keywords the user is
-        looking for, or a query in the form of a question.
-
-    Return
-    -----------
-    token_list: list of important keywords in the query.
-    """
-    nlp = spacy.load("en_core_web_sm")
-    token_list = [token.text.lower() for token in nlp(query)
-                  if not (token.is_stop or token.is_punct)]
-    return token_list
-
-def runSpacyMatcher(token_list:List[str], document:Text
-                    ) -> Tuple[List[List[int]], spacy.tokens.doc.Doc]:
-    """
-    Finds the keywords in the document using the Matcher class from spacy.
-    We could alternatively use regex, but spacy finds all keywords in a
-    serialized manner, which helps in the annotation of answers.
-
-    Params
-    -------
-    token_list: the token list returned by tokenize_lexical_query
-    document: text in which we need to find the tokens
-
-    Return
-    --------
-    matches: list of [start_index, end_index] in the spacy doc (at word level,
-        not character) for the keywords in the token list.
-    spacydoc: the keyword indices in the spacy doc are at word level, not character;
-        therefore, to allow the annotator to work seamlessly, the spacy doc is returned.
-    """
-    nlp = spacy.load("en_core_web_sm")
-    spacydoc = nlp(document)
-    matcher = Matcher(nlp.vocab)
-    token_pattern = [[{"LOWER":token}] for token in token_list]
-    matcher.add(",".join(token_list), token_pattern)
-    spacymatches = matcher(spacydoc)
-
-    # getting start and end index in spacydoc so that annotator can work seamlessly
-    matches = []
-    for match_id, start, end in spacymatches:
-        matches = matches + [[start, end]]
-
-    return matches, spacydoc
-
-def runRegexMatcher(token_list:List[str], document:Text):
-    """
-    Finds the keywords in the document using regex.
-
-    Params
-    -------
-    token_list: the token list returned by tokenize_lexical_query
-    document: text in which we need to find the tokens
-
-    Return
-    --------
-    matches: list of [start_index, end_index] in the document for the keywords
-        in the token list, at character level.
-    document: the keyword indices returned by regex are at character level;
-        therefore, to allow the annotator to work seamlessly, the text is returned back.
-    """
-    matches = []
-    for token in token_list:
-        matches = (matches +
-                   [[val.start(), val.start() +
-                     len(token)] for val in re.finditer(token, document)])
-
-    return matches, document
-
-def spacyAnnotator(matches: List[List[int]], document:spacy.tokens.doc.Doc):
-    """
-    This is the spacy annotator and needs a spacy doc.
-    Annotates the text in the document defined by a list of [start index, end index].
-    Example: "How are you today"; if the document type is text, matches = [[0,3]]
-    will give answer = "How", however if the spacy matcher was used then
-    matches = [[0,3]] will give answer = "How are you". If spacy is used
-    to find "How", then matches = [[0,1]] for the string defined above.
-
-    Params
-    -----------
-    matches: as mentioned, a list of lists. Example [[0,1],[10,13]]
-    document: document which needs to be annotated.
-
-    Return
-    --------
-    Sends the output either to the app front end using streamlit or
-    writes it directly to the output screen.
-    """
-    start = 0
-    annotated_text = ""
-    for match in matches:
-        start_idx = match[0]
-        end_idx = match[1]
-
-        if check_streamlit():
-            annotated_text = (annotated_text + document[start:start_idx].text
-                              + str(annotation(body=document[start_idx:end_idx].text,
-                                    label="ANSWER", background="#964448", color='#ffffff')))
-        else:
-            annotated_text = (annotated_text + document[start:start_idx].text
-                              + colored(document[start_idx:end_idx].text,
-                                        "green", attrs = ['bold']))
-
-        start = end_idx
-
-    annotated_text = annotated_text + document[end_idx:].text
-
-    if check_streamlit():
-        st.write(
-            markdown(annotated_text),
-            unsafe_allow_html=True,
-        )
-    else:
-        print(annotated_text)
-
-def lexical_search(query:Text, documents:List[Document], top_k:int):
-    """
-    Performs the lexical search on the list of Haystack documents which is
-    returned by the preprocessing pipeline.
-
-    Params
-    -------
-    query: keywords that need to be searched in the documents.
-    documents: list of Haystack documents returned by the preprocessing pipeline.
-    top_k: number of top results to be fetched.
-    """
-
-    document_store = InMemoryDocumentStore()
-    document_store.write_documents(documents)
-
-    # Haystack Retriever works with document stores only.
-    retriever = TfidfRetriever(document_store)
-    results = retriever.retrieve(query=query, top_k = top_k)
-    query_tokens = tokenize_lexical_query(query)
-    flag = True
-    for count, result in enumerate(results):
-        matches, doc = runSpacyMatcher(query_tokens, result.content)
-
-        if len(matches) != 0:
-            if flag:
-                flag = False
-                if check_streamlit():
-                    st.markdown("##### Top few lexical search (TFIDF) hits #####")
-                else:
-                    print("Top few lexical search (TFIDF) hits")
-
-            if check_streamlit():
-                st.write("Result {}".format(count+1))
-            else:
-                print("Results {}".format(count+1))
-            spacyAnnotator(matches, doc)
-
-    if flag:
-        if check_streamlit():
-            st.info("🤔 No relevant result found. Please try another keyword.")
-        else:
-            print("No relevant result found. Please try another keyword.")
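Note: an illustrative call sequence for the deleted module (a sketch only; the file path is made up and spaCy's en_core_web_sm model must be installed):

from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search

# Convert and split an uploaded file into Haystack Documents (path is illustrative)
output = runLexicalPreprocessingPipeline(file_name="report.pdf",
                                         file_path="docStore/report.pdf")

# TFIDF retrieval plus keyword highlighting of the top 3 hits
lexical_search(query="water sanitation", documents=output['documents'], top_k=3)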
utils/ndc_explorer.py
DELETED
@@ -1,90 +0,0 @@
-
-import urllib.request
-import json
-
-link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
-def get_document(country_code: str):
-    """
-    Reads the country NDC data from
-    https://klimalog.die-gdi.de/ndc/open-data/dataset.json
-    using the country code.
-
-    Params
-    -------
-    country_code:"""
-    with urllib.request.urlopen(link) as urlfile:
-        data = json.loads(urlfile.read())
-    categoriesData = {}
-    categoriesData['categories'] = data['categories']
-    categoriesData['subcategories'] = data['subcategories']
-    keys_sub = categoriesData['subcategories'].keys()
-    documentType = 'NDCs'
-    if documentType in data.keys():
-        if country_code in data[documentType].keys():
-            get_dict = {}
-            for key, value in data[documentType][country_code].items():
-                if key not in ['country_name','region_id', 'region_name']:
-                    get_dict[key] = value['classification']
-                else:
-                    get_dict[key] = value
-        else:
-            return None
-    else:
-        return None
-
-    country = {}
-    for key in categoriesData['categories']:
-        country[key] = {}
-    for key, value in categoriesData['subcategories'].items():
-        country[value['category']][key] = get_dict[key]
-
-    return country
-
-
-def countrySpecificCCA(cca_sent:dict, threshold:int, countryCode:str):
-    """
-    Based on the country code, reads the country data from
-    https://klimalog.die-gdi.de/ndc/open-data/dataset.json
-    using get_document from utils.ndc_explorer.py.
-    Then, based on the threshold value, filters the Climate Change Adaptation
-    targets assigned by the NDC explorer team to that country. Using the sentences
-    created by the Data services team of GIZ for each target level, tries to find
-    the relevant passages from the document by doing the semantic search.
-
-    Params
-    -------
-    cca_sent: dictionary with keys as 'target labels' and manufactured sentences
-        reflecting the target level. Please see docStore/ndcs/cca.txt
-    threshold: NDC targets have many categories ranging from [0-5], with 0
-        reflecting the most relaxed attitude and 5 being the most aggressive towards
-        climate change. We select the threshold value beyond which we need to focus.
-    countryCode: standard country code to allow us to fetch the country-specific data.
-    """
-    temp = {}
-    doc = get_document(countryCode)
-    for key, value in cca_sent.items():
-        id_ = doc['climate change adaptation'][key]['id']
-        if id_ > threshold:
-            temp[key] = value['id'][id_]
-    return temp
-
-
-def countrySpecificCCM(ccm_sent, threshold, countryCode):
-    """
-    See the documentation of countrySpecificCCA. This is the same, except that
-    it gets the data pertaining to climate change mitigation.
-    """
-
-    temp = {}
-    doc = get_document(countryCode)
-    for key, value in ccm_sent.items():
-        id_ = doc['climate change mitigation'][key]['id']
-        if id_ > threshold:
-            temp[key] = value['id'][id_]
-
-    return temp
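Note: a rough usage sketch (the country code format and the structure of the sentence dictionary are assumptions based on the docstrings above; the real sentences live in docStore/ndcs/cca.txt):

from utils.ndc_explorer import get_document, countrySpecificCCA

country_data = get_document("IND")  # country code format assumed

# Hypothetical target-level sentences, keyed like the entries in docStore/ndcs/cca.txt
cca_sent = {"adaptation_target": {"id": {1: "no adaptation target",
                                         4: "quantified adaptation target"}}}
relevant_sentences = countrySpecificCCA(cca_sent, threshold=2, countryCode="IND")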
utils/preprocessing.py
DELETED
@@ -1,260 +0,0 @@
-from haystack.nodes.base import BaseComponent
-from haystack.schema import Document
-from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
-from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
-from typing import Callable, Dict, List, Optional, Text, Tuple, Union
-from typing_extensions import Literal
-import pandas as pd
-import logging
-import re
-import string
-from haystack.pipelines import Pipeline
-
-def useOCR(file_path: str) -> Text:
-    """
-    Converts image pdfs into text, using farm-haystack[OCR].
-
-    Params
-    ----------
-    file_path: file_path of the uploaded file, returned by the add_upload function in
-        uploadAndExample.py
-
-    Returns the text file as a string.
-    """
-
-    converter = PDFToTextOCRConverter(remove_numeric_tables=True,
-                                      valid_languages=["eng"])
-    docs = converter.convert(file_path=file_path, meta=None)
-    return docs[0].content
-
-
-class FileConverter(BaseComponent):
-    """
-    Wrapper class to convert an uploaded document into text by calling the
-    appropriate converter class; will internally use haystack PDFToTextOCR in
-    case of an image pdf. Cannot use the FileClassifier from haystack as it
-    doesn't have any label/output class for images.
-
-    1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
-    2. https://docs.haystack.deepset.ai/docs/file_converters
-    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
-    4. https://docs.haystack.deepset.ai/reference/file-converters-api
-    """
-
-    outgoing_edges = 1
-
-    def run(self, file_name: str, file_path: str, encoding: Optional[str]=None,
-            id_hash_keys: Optional[List[str]] = None,
-            ) -> Tuple[dict, str]:
-        """ This is the required method to invoke the component in
-        the pipeline implementation.
-
-        Params
-        ----------
-        file_name: name of file
-        file_path: file_path of the uploaded file, returned by the add_upload function
-            in uploadAndExample.py
-
-        See the links provided in the class docstring/description for other params.
-
-        Return
-        ---------
-        output: dictionary, with key as identifier and value could be anything
-            we need to return. In this case it is the list of Haystack Documents.
-        output_1: as there is only one outgoing edge, we pass the 'output_1' string
-        """
-        try:
-            if file_name.endswith('.pdf'):
-                converter = PDFToTextConverter(remove_numeric_tables=True)
-            if file_name.endswith('.txt'):
-                converter = TextConverter(remove_numeric_tables=True)
-            if file_name.endswith('.docx'):
-                converter = DocxToTextConverter()
-        except Exception as e:
-            logging.error(e)
-            return
-
-        documents = []
-
-        document = converter.convert(
-            file_path=file_path, meta=None,
-            encoding=encoding, id_hash_keys=id_hash_keys
-        )[0]
-
-        text = document.content
-
-        # if file is an image pdf then it will have {'content': "\x0c\x0c\x0c\x0c"};
-        # substitute this substring with '' and check if content is an empty string
-
-        text = re.sub(r'\x0c', '', text)
-        documents.append(Document(content=text,
-                                  meta={"name": file_name},
-                                  id_hash_keys=id_hash_keys))
-
-        # check if text is empty and apply the pdf OCR converter.
-        for i in documents:
-            if i.content == "":
-                logging.info("Using OCR")
-                i.content = useOCR(file_path)
-
-        logging.info('file conversion successful')
-        output = {'documents': documents}
-        return output, 'output_1'
-
-    def run_batch(self):
-        """
-        We don't have a requirement to process multiple files in one go,
-        therefore nothing here; however, to use the custom node we need to have
-        this method for the class.
-        """
-        return
-
-
-def basic(s:str, remove_punc:bool = False):
-    """
-    Performs basic cleaning of text.
-
-    Params
-    ----------
-    s: string to be processed
-    remove_punc: whether to remove all punctuation, including ',' and '.'
-
-    Returns: processed string; see comments in the source code for more info
-    """
-
-    # Remove URLs
-    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
-    s = re.sub(r"http\S+", " ", s)
-
-    # Remove new line characters
-    s = re.sub('\n', ' ', s)
-
-    # Remove punctuations
-    if remove_punc == True:
-        translator = str.maketrans(' ', ' ', string.punctuation)
-        s = s.translate(translator)
-    # Remove distracting single quotes and dotted pattern
-    s = re.sub("\'", " ", s)
-    s = s.replace("..", "")
-
-    return s.strip()
-
-
-class UdfPreProcessor(BaseComponent):
-    """
-    Class to preprocess the document returned by FileConverter. It will check
-    the splitting strategy, split the document by word or sentence, and then
-    synthetically create the paragraphs.
-
-    1. https://docs.haystack.deepset.ai/docs/preprocessor
-    2. https://docs.haystack.deepset.ai/reference/preprocessor-api
-    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
-    """
-    outgoing_edges = 1
-
-    def run(self, documents:List[Document], remove_punc:bool=False,
-            split_by: Literal["sentence", "word"] = 'sentence',
-            split_length:int = 2, split_respect_sentence_boundary:bool = False,
-            split_overlap:int = 0):
-
-        """ This is the required method to invoke the component in
-        the pipeline implementation.
-
-        Params
-        ----------
-        documents: documents from the output dictionary returned by FileConverter
-        remove_punc: whether to remove all punctuation, including ',' and '.'
-        split_by: document splitting strategy, either word or sentence
-        split_length: when synthetically creating the paragraphs from the document,
-            it defines the length of a paragraph.
-        split_respect_sentence_boundary: used when using the 'word' strategy for
-            splitting of text.
-        split_overlap: number of words or sentences that overlap when creating
-            the paragraphs. This is done as one sentence or 'some words' make sense
-            when read together with others; therefore the overlap is used.
-
-        Return
-        ---------
-        output: dictionary, with key as identifier and value could be anything
-            we need to return. In this case the output will contain 4 objects:
-            the paragraph text list as List, Haystack Documents, a DataFrame and
-            one raw text string.
-        output_1: as there is only one outgoing edge, we pass the 'output_1' string
-        """
-
-        if split_by == 'sentence':
-            split_respect_sentence_boundary = False
-        else:
-            split_respect_sentence_boundary = split_respect_sentence_boundary
-
-        preprocessor = PreProcessor(
-            clean_empty_lines=True,
-            clean_whitespace=True,
-            clean_header_footer=True,
-            split_by=split_by,
-            split_length=split_length,
-            split_respect_sentence_boundary=split_respect_sentence_boundary,
-            split_overlap=split_overlap,
-
-            # will add page number only in case of PDF, not for text/docx files.
-            add_page_number=True
-        )
-
-        for i in documents:
-            # # basic cleaning before passing it to preprocessor.
-            # i = basic(i)
-            docs_processed = preprocessor.process([i])
-            for item in docs_processed:
-                item.content = basic(item.content, remove_punc=remove_punc)
-
-        df = pd.DataFrame(docs_processed)
-        all_text = " ".join(df.content.to_list())
-        para_list = df.content.to_list()
-        logging.info('document split into {} paragraphs'.format(len(para_list)))
-        output = {'documents': docs_processed,
-                  'dataframe': df,
-                  'text': all_text,
-                  'paraList': para_list
-                  }
-        return output, "output_1"
-
-    def run_batch(self):
-        """
-        We don't have a requirement to process multiple files in one go,
-        therefore nothing here; however, to use the custom node we need to have
-        this method for the class.
-        """
-        return
-
-def processingpipeline():
-    """
-    Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcessor
-    from utils.preprocessing
-    """
-
-    preprocessing_pipeline = Pipeline()
-    file_converter = FileConverter()
-    custom_preprocessor = UdfPreProcessor()
-
-    preprocessing_pipeline.add_node(component=file_converter,
-                                    name="FileConverter", inputs=["File"])
-    preprocessing_pipeline.add_node(component=custom_preprocessor,
-                                    name='UdfPreProcessor', inputs=["FileConverter"])
-
-    return preprocessing_pipeline
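Note: for reference, the deleted pipeline was driven like this (a minimal sketch; the file path and name are illustrative):

from utils.preprocessing import processingpipeline

pipeline = processingpipeline()
output = pipeline.run(file_paths="docStore/sample.pdf",
                      params={"FileConverter": {"file_path": "docStore/sample.pdf",
                                                "file_name": "sample.pdf"},
                              "UdfPreProcessor": {"remove_punc": False,
                                                  "split_by": "sentence",
                                                  "split_length": 2}})

paragraphs = output['documents']  # list of Haystack Documents, one per synthetic paragraph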
utils/sdg_classifier.py
DELETED
@@ -1,177 +0,0 @@
-from haystack.nodes import TransformersDocumentClassifier
-from haystack.schema import Document
-from typing import List, Tuple
-from typing_extensions import Literal
-import logging
-import pandas as pd
-from pandas import DataFrame, Series
-from utils.checkconfig import getconfig
-from utils.streamlitcheck import check_streamlit
-from utils.preprocessing import processingpipeline
-try:
-    import streamlit as st
-except ImportError:
-    logging.info("Streamlit not installed")
-
-## Labels dictionary ###
-_lab_dict = {0: 'no_cat',
-             1: 'SDG 1 - No poverty',
-             2: 'SDG 2 - Zero hunger',
-             3: 'SDG 3 - Good health and well-being',
-             4: 'SDG 4 - Quality education',
-             5: 'SDG 5 - Gender equality',
-             6: 'SDG 6 - Clean water and sanitation',
-             7: 'SDG 7 - Affordable and clean energy',
-             8: 'SDG 8 - Decent work and economic growth',
-             9: 'SDG 9 - Industry, Innovation and Infrastructure',
-             10: 'SDG 10 - Reduced inequality',
-             11: 'SDG 11 - Sustainable cities and communities',
-             12: 'SDG 12 - Responsible consumption and production',
-             13: 'SDG 13 - Climate action',
-             14: 'SDG 14 - Life below water',
-             15: 'SDG 15 - Life on land',
-             16: 'SDG 16 - Peace, justice and strong institutions',
-             17: 'SDG 17 - Partnership for the goals',}
-
-@st.cache(allow_output_mutation=True)
-def load_sdgClassifier(config_file:str = None, classifier_name:str = None):
-    """
-    Loads the document classifier using haystack, where the name/path of the model
-    on the HF hub is passed as a string to fetch the model object. Either the config
-    file or the model name should be passed.
-    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
-    2. https://docs.haystack.deepset.ai/docs/document_classifier
-
-    Params
-    --------
-    config_file: config file path from which to read the model name
-    classifier_name: if a model name is passed, it takes priority; if not
-        found, the config file will be used, else an error is raised.
-
-    Return: document classifier model
-    """
-    if not classifier_name:
-        if not config_file:
-            logging.warning("Pass either model name or config file")
-            return
-        else:
-            config = getconfig(config_file)
-            classifier_name = config.get('sdg', 'MODEL')
-
-    logging.info("Loading classifier")
-    doc_classifier = TransformersDocumentClassifier(
-        model_name_or_path=classifier_name,
-        task="text-classification")
-
-    return doc_classifier
-
-
-@st.cache(allow_output_mutation=True)
-def sdg_classification(haystack_doc:List[Document],
-                       threshold:float = 0.8,
-                       classifier_model:TransformersDocumentClassifier = None
-                       ) -> Tuple[DataFrame, Series]:
-    """
-    Text classification on the list of texts provided. The classifier provides the
-    most appropriate label for each text; these labels indicate which particular
-    Sustainable Development Goal (SDG) the text belongs to.
-
-    Params
-    ---------
-    haystack_doc: list of Haystack Documents. The output of the preprocessing pipeline
-        contains the list of paragraphs in different formats; here the list of
-        Haystack Documents is used.
-    threshold: threshold value for the model to keep the results from the classifier
-    classifier_model: you can pass the classifier model directly, which takes priority;
-        if not, the model is looked up in the streamlit session.
-        In case of streamlit, avoid passing the model directly.
-
-    Returns
-    ----------
-    df: DataFrame with columns ['SDG', 'Relevancy', 'text']
-    x: Series object with the unique SDGs covered in the uploaded document and
-        the number of times each is covered/discussed (count of paragraphs).
-    """
-    logging.info("Working on SDG Classification")
-    if not classifier_model:
-        if check_streamlit():
-            classifier_model = st.session_state['sdg_classifier']
-        else:
-            logging.warning("No streamlit environment found, pass the classifier")
-            return
-
-    results = classifier_model.predict(haystack_doc)
-
-    labels_ = [(l.meta['classification']['label'],
-                l.meta['classification']['score'], l.content,) for l in results]
-
-    df = DataFrame(labels_, columns=["SDG", "Relevancy", "text"])
-
-    df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
-    df.index += 1
-    df = df[df['Relevancy'] > threshold]
-
-    # creating the dataframe for value counts of SDG, along with 'title' of SDGs
-    x = df['SDG'].value_counts()
-    x = x.rename('count')
-    x = x.rename_axis('SDG').reset_index()
-    x["SDG"] = pd.to_numeric(x["SDG"])
-    x = x.sort_values(by=['count'], ascending=False)
-    x['SDG_name'] = x['SDG'].apply(lambda x: _lab_dict[x])
-    x['SDG_Num'] = x['SDG'].apply(lambda x: "SDG " + str(x))
-
-    df['SDG'] = pd.to_numeric(df['SDG'])
-    df = df.sort_values('SDG')
-
-    return df, x
-
-def runSDGPreprocessingPipeline(file_name:str, file_path:str,
-                    split_by: Literal["sentence", "word"] = 'sentence',
-                    split_length:int = 2, split_respect_sentence_boundary:bool = False,
-                    split_overlap:int = 0, remove_punc:bool = False) -> List[Document]:
-    """
-    Creates and runs the preprocessing pipeline;
-    the params for the pipeline are fetched from paramconfig.
-
-    Params
-    ------------
-    file_name: filename, in case of streamlit application use
-        st.session_state['filename']
-    file_path: filepath, in case of streamlit application use st.session_state['filepath']
-    split_by: document splitting strategy, either word or sentence
-    split_length: when synthetically creating the paragraphs from the document,
-        it defines the length of a paragraph.
-    split_respect_sentence_boundary: used when using the 'word' strategy for
-        splitting of text.
-    split_overlap: number of words or sentences that overlap when creating
-        the paragraphs. This is done as one sentence or 'some words' make sense
-        when read together with others; therefore the overlap is used.
-    remove_punc: whether to remove all punctuation, including ',' and '.'
-
-    Return
-    --------------
-    List[Document]: when the preprocessing pipeline is run, the output dictionary
-        has four objects. For the Haystack implementation of SDG classification we
-        need the list of Haystack Documents, which can be fetched by
-        key = 'documents' on the output.
-    """
-
-    sdg_processing_pipeline = processingpipeline()
-
-    output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
-                            params= {"FileConverter": {"file_path": file_path, \
-                                        "file_name": file_name},
-                                     "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                            "split_by": split_by, \
-                                            "split_length":split_length,\
-                                            "split_overlap": split_overlap, \
-                                            "split_respect_sentence_boundary":split_respect_sentence_boundary}})
-
-    return output_sdg_pre
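Note: a minimal end-to-end sketch of how these helpers fit together (the file path and config name are illustrative; outside Streamlit the classifier must be passed explicitly):

from utils.sdg_classifier import (runSDGPreprocessingPipeline,
                                  load_sdgClassifier, sdg_classification)

output = runSDGPreprocessingPipeline(file_name="policy.pdf",
                                     file_path="docStore/policy.pdf")

# Loads the HF model named in the [sdg] MODEL entry of the config file
classifier = load_sdgClassifier(config_file="paramconfig.cfg")

# Per-paragraph SDG labels above the threshold, plus per-SDG paragraph counts
df, counts = sdg_classification(output['documents'], threshold=0.8,
                                classifier_model=classifier)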
utils/semantic_search.py
DELETED
@@ -1,582 +0,0 @@
|
|
1 |
-
from haystack.nodes import TransformersQueryClassifier, Docs2Answers
|
2 |
-
from haystack.nodes import EmbeddingRetriever, FARMReader
|
3 |
-
from haystack.nodes.base import BaseComponent
|
4 |
-
from haystack.document_stores import InMemoryDocumentStore
|
5 |
-
from markdown import markdown
|
6 |
-
from annotated_text import annotation
|
7 |
-
from haystack.schema import Document
|
8 |
-
from typing import List, Text, Union
|
9 |
-
from typing_extensions import Literal
|
10 |
-
from utils.preprocessing import processingpipeline
|
11 |
-
from utils.streamlitcheck import check_streamlit
|
12 |
-
from haystack.pipelines import Pipeline
|
13 |
-
import pandas as pd
|
14 |
-
import logging
|
15 |
-
try:
|
16 |
-
from termcolor import colored
|
17 |
-
except:
|
18 |
-
pass
|
19 |
-
try:
|
20 |
-
import streamlit as st
|
21 |
-
except ImportError:
|
22 |
-
logging.info("Streamlit not installed")
|
23 |
-
|
24 |
-
|
25 |
-
@st.cache(allow_output_mutation=True)
|
26 |
-
def loadQueryClassifier():
|
27 |
-
"""
|
28 |
-
retuns the haystack query classifier model
|
29 |
-
model = shahrukhx01/bert-mini-finetune-question-detection
|
30 |
-
|
31 |
-
"""
|
32 |
-
query_classifier = TransformersQueryClassifier(model_name_or_path=
|
33 |
-
"shahrukhx01/bert-mini-finetune-question-detection")
|
34 |
-
return query_classifier
|
35 |
-
|
36 |
-
class QueryCheck(BaseComponent):
|
37 |
-
"""
|
38 |
-
Uses Query Classifier from Haystack, process the query based on query type.
|
39 |
-
Ability to determine the statements is not so good, therefore the chances
|
40 |
-
statement also get modified. Ex: "List water related issues" will be
|
41 |
-
identified by the model as keywords, and therefore it be processed as "what
|
42 |
-
are the 'list all water related issues' related issues and discussions?".
|
43 |
-
This is one shortcoming but is igonred for now, as semantic search will not
|
44 |
-
get affected a lot, by this. If you want to pass keywords list and want to
|
45 |
-
do batch processing use. run_batch. Example: if you want to find relevant
|
46 |
-
passages for water, food security, poverty then querylist = ["water", "food
|
47 |
-
security","poverty"] and then execute QueryCheck.run_batch(queries = querylist)
|
48 |
-
|
49 |
-
1. https://docs.haystack.deepset.ai/docs/query_classifier
|
50 |
-
|
51 |
-
"""
|
52 |
-
|
53 |
-
outgoing_edges = 1
|
54 |
-
|
55 |
-
def run(self, query:str):
|
56 |
-
"""
|
57 |
-
mandatory method to use the custom node. Determines the query type, if
|
58 |
-
if the query is of type keyword/statement will modify it to make it more
|
59 |
-
useful for sentence transoformers.
|
60 |
-
|
61 |
-
Params
|
62 |
-
--------
|
63 |
-
query: query/statement/keywords in form of string
|
64 |
-
|
65 |
-
Return
|
66 |
-
------
|
67 |
-
output: dictionary, with key as identifier and value could be anything
|
68 |
-
we need to return. In this case the output contain key = 'query'.
|
69 |
-
|
70 |
-
output_1: As there is only one outgoing edge, we pass 'output_1' string
|
71 |
-
|
72 |
-
"""
|
73 |
-
query_classifier = loadQueryClassifier()
|
74 |
-
result = query_classifier.run(query=query)
|
75 |
-
|
76 |
-
if result[1] == "output_1":
|
77 |
-
output = {"query":query,
|
78 |
-
"query_type": 'question/statement'}
|
79 |
-
else:
|
80 |
-
output = {"query": "what are the {} related issues and \
|
81 |
-
discussions?".format(query),
|
82 |
-
"query_type": 'statements/keyword'}
|
83 |
-
logging.info(output)
|
84 |
-
return output, "output_1"
|
85 |
-
|
86 |
-
def run_batch(self, queries:List[str]):
|
87 |
-
"""
|
88 |
-
running multiple queries in one go, howeevr need the queries to be passed
|
89 |
-
as list of string. Example: if you want to find relevant passages for
|
90 |
-
water, food security, poverty then querylist = ["water", "food security",
|
91 |
-
"poverty"] and then execute QueryCheck.run_batch(queries = querylist)
|
92 |
-
|
93 |
-
Params
|
94 |
-
--------
|
95 |
-
queries: queries/statements/keywords in form of string encapsulated
|
96 |
-
within List
|
97 |
-
|
98 |
-
Return
|
99 |
-
------
|
100 |
-
output: dictionary, with key as identifier and value could be anything
|
101 |
-
we need to return. In this case the output contain key = 'queries'.
|
102 |
-
|
103 |
-
output_1: As there is only one outgoing edge, we pass 'output_1' string
|
104 |
-
"""
|
105 |
-
query_classifier = loadQueryClassifier()
|
106 |
-
query_list = []
|
107 |
-
for query in queries:
|
108 |
-
result = query_classifier.run(query=query)
|
109 |
-
if result[1] == "output_1":
|
110 |
-
query_list.append(query)
|
111 |
-
else:
|
112 |
-
query_list.append("what are the {} related issues and \
|
113 |
-
discussions?".format(query))
|
114 |
-
output = {'queries':query_list}
|
115 |
-
logging.info(output)
|
116 |
-
return output, "output_1"
|
117 |
-
|
118 |
-
|
119 |
-
@st.cache(allow_output_mutation=True)
|
120 |
-
def runSemanticPreprocessingPipeline(file_path:str, file_name:str,
|
121 |
-
split_by: Literal["sentence", "word"] = 'sentence',
|
122 |
-
split_length:int = 2, split_overlap:int = 0,
|
123 |
-
split_respect_sentence_boundary:bool = False,
|
124 |
-
remove_punc:bool = False)->List[Document]:
|
125 |
-
"""
|
126 |
-
creates the pipeline and runs the preprocessing pipeline.
|
127 |
-
|
128 |
-
Params
|
129 |
-
------------
|
130 |
-
|
131 |
-
file_name: filename, in case of streamlit application use
|
132 |
-
st.session_state['filename']
|
133 |
-
file_path: filepath, in case of streamlit application use
|
134 |
-
st.session_state['filepath']
|
135 |
-
split_by: document splitting strategy either as word or sentence
|
136 |
-
split_length: when synthetically creating the paragrpahs from document,
|
137 |
-
it defines the length of paragraph.
|
138 |
-
split_overlap: Number of words or sentences that overlap when creating the
|
139 |
-
paragraphs. This is done as one sentence or 'some words' make sense
|
140 |
-
when read in together with others. Therefore the overlap is used.
|
141 |
-
split_respect_sentence_boundary: Used when using 'word' strategy for
|
142 |
-
splititng of text.
|
143 |
-
remove_punc: to remove all Punctuation including ',' and '.' or not
|
144 |
-
|
145 |
-
Return
|
146 |
-
--------------
|
147 |
-
List[Document]: When preprocessing pipeline is run, the output dictionary
|
148 |
-
has four objects. For the Haysatck implementation of semantic search we,
|
149 |
-
need to use the List of Haystack Document, which can be fetched by
|
150 |
-
key = 'documents' on output.
|
151 |
-
|
152 |
-
"""
|
153 |
-
|
154 |
-
semantic_processing_pipeline = processingpipeline()
|
155 |
-
|
156 |
-
output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
|
157 |
-
params= {"FileConverter": {"file_path": file_path, \
|
158 |
-
"file_name": file_name},
|
159 |
-
"UdfPreProcessor": {"remove_punc": remove_punc, \
|
160 |
-
"split_by": split_by, \
|
161 |
-
"split_length":split_length,\
|
162 |
-
"split_overlap": split_overlap,
|
163 |
-
"split_respect_sentence_boundary":split_respect_sentence_boundary}})
|
164 |
-
|
165 |
-
return output_semantic_pre
|
166 |
-
|
167 |
-
|
168 |
-
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
|
169 |
-
allow_output_mutation=True)
|
170 |
-
def loadRetriever(embedding_model:Text=None, embedding_model_format:Text = None,
|
171 |
-
embedding_layer:int = None, retriever_top_k:int = 10,
|
172 |
-
max_seq_len:int=512, document_store:InMemoryDocumentStore=None):
|
173 |
-
"""
|
174 |
-
Returns the Retriever model based on params provided.
|
175 |
-
1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
|
176 |
-
2. https://www.sbert.net/examples/applications/semantic-search/README.html
|
177 |
-
3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
|
178 |
-
|
179 |
-
|
180 |
-
Params
|
181 |
-
---------
|
182 |
-
embedding_model: Name of the model to be used for embedding. Check the links
|
183 |
-
provided in documentation
|
184 |
-
embedding_model_format: check the github link of Haystack provided in
|
185 |
-
documentation embedding_layer: check the github link of Haystack
|
186 |
-
provided in documentation retriever_top_k: Number of Top results to
|
187 |
-
be returned by
|
188 |
-
retriever max_seq_len: everymodel has max seq len it can handle, check in
|
189 |
-
model card. Needed to hanlde the edge cases.
|
190 |
-
document_store: InMemoryDocumentStore, write haystack Document list to
|
191 |
-
DocumentStore and pass the same to function call. Can be done using
|
192 |
-
createDocumentStore from utils.
|
193 |
-
|
194 |
-
Return
|
195 |
-
-------
|
196 |
-
retriever: embedding model
|
197 |
-
"""
|
198 |
-
logging.info("loading retriever")
|
199 |
-
if document_store is None:
|
200 |
-
logging.warning("Retriever initialization requires the DocumentStore")
|
201 |
-
return
|
202 |
-
|
203 |
-
retriever = EmbeddingRetriever(
|
204 |
-
embedding_model=embedding_model,top_k = retriever_top_k,
|
205 |
-
document_store = document_store,
|
206 |
-
emb_extraction_layer=embedding_layer, scale_score =True,
|
207 |
-
model_format=embedding_model_format, use_gpu = True,
|
208 |
-
max_seq_len = max_seq_len )
|
209 |
-
if check_streamlit:
|
210 |
-
st.session_state['retriever'] = retriever
|
211 |
-
return retriever
|
212 |
-
|
213 |
-
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
|
214 |
-
allow_output_mutation=True)
|
215 |
-
def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
|
216 |
-
embedding_dim:int = 768):
|
217 |
-
"""
|
218 |
-
Creates the InMemory Document Store from haystack list of Documents.
|
219 |
-
It is mandatory component for Retriever to work in Haystack frame work.
|
220 |
-
|
221 |
-
Params
|
222 |
-
-------
|
223 |
-
documents: List of haystack document. If using the preprocessing pipeline,
|
224 |
-
can be fetched key = 'documents; on output of preprocessing pipeline.
|
225 |
-
similarity: scoring function, can be either 'cosine' or 'dot_product'
|
226 |
-
embedding_dim: Document store has default value of embedding size = 768, and
|
227 |
-
update_embeddings method of Docstore cannot infer the embedding size of
|
228 |
-
retiever automatically, therefore set this value as per the model card.
|
229 |
-
|
230 |
-
Return
|
231 |
-
-------
|
232 |
-
document_store: InMemory Document Store object type.
|
233 |
-
|
234 |
-
"""
|
235 |
-
document_store = InMemoryDocumentStore(similarity = similarity,
|
236 |
-
embedding_dim = embedding_dim )
|
237 |
-
document_store.write_documents(documents)
|
238 |
-
|
239 |
-
return document_store
|
240 |
-
|
241 |
-
|
242 |
-
@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
|
243 |
-
allow_output_mutation=True)
|
244 |
-
def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
|
245 |
-
embedding_model_format:Text = None,embedding_layer:int = None,
|
246 |
-
embedding_dim:int = 768,retriever_top_k:int = 10,
|
247 |
-
reader_model:str = None, reader_top_k:int = 10,
|
248 |
-
max_seq_len:int =512,useQueryCheck = True,
|
249 |
-
top_k_per_candidate:int = 1):
|
250 |
-
"""
|
251 |
-
creates the semantic search pipeline and document Store object from the
|
252 |
-
list of haystack documents. The top_k for the Reader and Retirever are kept
|
253 |
-
same, so that all the results returned by Retriever are used, however the
|
254 |
-
context is extracted by Reader for each retrieved result. The querycheck is
|
255 |
-
added as node to process the query. This pipeline is suited for keyword search,
|
256 |
-
and to some extent extractive QA purpose. The purpose of Reader is strictly to
|
257 |
-
highlight the context for retrieved result and not for QA, however as stated
|
258 |
-
it can work for QA too in limited sense.
|
259 |
-
There are 4 variants of pipeline it can return
|
260 |
-
1.QueryCheck > Retriever > Reader
|
261 |
-
2.Retriever > Reader
|
262 |
-
3.QueryCheck > Retriever > Docs2Answers : If reader is None,
|
263 |
-
then Doc2answer is used to keep the output of pipeline structurally same.
|
264 |
-
4.Retriever > Docs2Answers
|
265 |
-
|
266 |
-
Links
|
267 |
-
|
268 |
-
1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
|
269 |
-
2. https://www.sbert.net/examples/applications/semantic-search/README.html
|
270 |
-
3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
|
271 |
-
4. https://docs.haystack.deepset.ai/docs/reader
|
272 |
-
|
273 |
-
|
274 |
-
Params
|
275 |
-
----------
|
276 |
-
documents: list of Haystack Documents, returned by preprocessig pipeline.
|
277 |
-
embedding_model: Name of the model to be used for embedding. Check the links
|
278 |
-
provided in documentation
|
279 |
-
embedding_model_format: check the github link of Haystack provided in
|
280 |
-
documentation
|
281 |
-
embedding_layer: check the github link of Haystack provided in documentation
|
282 |
-
embedding_dim: Document store has default value of embedding size = 768, and
|
283 |
-
update_embeddings method of Docstore cannot infer the embedding size of
|
284 |
-
retiever automatically, therefore set this value as per the model card.
|
285 |
-
retriever_top_k: Number of Top results to be returned by retriever
|
286 |
-
reader_model: Name of the model to be used for Reader node in hasyatck
|
287 |
-
Pipeline. Check the links provided in documentation
|
288 |
-
reader_top_k: Reader will use retrieved results to further find better matches.
|
289 |
-
As purpose here is to use reader to extract context, the value is
|
290 |
-
same as retriever_top_k.
|
291 |
-
max_seq_len:everymodel has max seq len it can handle, check in model card.
|
292 |
-
Needed to hanlde the edge cases
|
293 |
-
useQueryCheck: Whether to use the querycheck which modifies the query or not.
|
294 |
-
top_k_per_candidate:How many answers to extract for each candidate doc
|
295 |
-
that is coming from the retriever
|
296 |
-
|
297 |
-
Return
|
298 |
-
---------
|
299 |
-
semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
|
300 |
-
nodes [QueryCheck, Retriever, Reader/Docs2Answer]. If reader is None,
|
301 |
-
then Doc2answer is used to keep the output of pipeline structurally
|
302 |
-
same.
|
303 |
-
|
304 |
-
document_store: As retriever can work only with Haystack Document Store, the
|
305 |
-
list of document returned by preprocessing pipeline are fed into to
|
306 |
-
get InMemmoryDocumentStore object type, with retriever updating the
|
307 |
-
embeddings of each paragraph in document store.
|
308 |
-
|
309 |
-
"""
    document_store = createDocumentStore(documents=documents,
                                         embedding_dim=embedding_dim)
    retriever = loadRetriever(embedding_model=embedding_model,
                              embedding_model_format=embedding_model_format,
                              embedding_layer=embedding_layer,
                              retriever_top_k=retriever_top_k,
                              document_store=document_store,
                              max_seq_len=max_seq_len)
    document_store.update_embeddings(retriever)
    semantic_search_pipeline = Pipeline()
    if useQueryCheck and reader_model:
        querycheck = QueryCheck()
        reader = FARMReader(model_name_or_path=reader_model,
                            top_k=reader_top_k, use_gpu=True,
                            top_k_per_candidate=top_k_per_candidate)
        semantic_search_pipeline.add_node(component=querycheck,
                                          name="QueryCheck", inputs=["Query"])
        semantic_search_pipeline.add_node(component=retriever,
                                          name="EmbeddingRetriever",
                                          inputs=["QueryCheck.output_1"])
        semantic_search_pipeline.add_node(component=reader, name="FARMReader",
                                          inputs=["EmbeddingRetriever"])

    elif reader_model:
        reader = FARMReader(model_name_or_path=reader_model,
                            top_k=reader_top_k, use_gpu=True,
                            top_k_per_candidate=top_k_per_candidate)
        semantic_search_pipeline.add_node(component=retriever,
                                          name="EmbeddingRetriever",
                                          inputs=["Query"])
        semantic_search_pipeline.add_node(component=reader,
                                          name="FARMReader",
                                          inputs=["EmbeddingRetriever"])
    elif useQueryCheck and not reader_model:
        querycheck = QueryCheck()
        docs2answers = Docs2Answers()
        semantic_search_pipeline.add_node(component=querycheck,
                                          name="QueryCheck", inputs=["Query"])
        semantic_search_pipeline.add_node(component=retriever,
                                          name="EmbeddingRetriever",
                                          inputs=["QueryCheck.output_1"])
        semantic_search_pipeline.add_node(component=docs2answers,
                                          name="Docs2Answers",
                                          inputs=["EmbeddingRetriever"])
    elif not useQueryCheck and not reader_model:
        docs2answers = Docs2Answers()
        semantic_search_pipeline.add_node(component=retriever,
                                          name="EmbeddingRetriever",
                                          inputs=["Query"])
        semantic_search_pipeline.add_node(component=docs2answers,
                                          name="Docs2Answers",
                                          inputs=["EmbeddingRetriever"])

    logging.info(semantic_search_pipeline.components)
    return semantic_search_pipeline, document_store
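
For reference, a minimal sketch of how this builder might have been invoked; the model names, embedding size, and the `paragraphs` variable are illustrative placeholders, not values fixed by this file.

# Hypothetical usage sketch (not part of the deleted module):
# `paragraphs` is assumed to be the List[Document] produced by the
# preprocessing pipeline; model names are placeholders from public hubs.
pipeline, doc_store = semanticSearchPipeline(
    documents=paragraphs,
    embedding_model="sentence-transformers/msmarco-distilbert-cos-v5",
    embedding_model_format="sentence_transformers",
    embedding_layer=None,                 # placeholder; set per the model card
    embedding_dim=768,                    # must match the embedding model card
    retriever_top_k=10,
    reader_model="deepset/tinyroberta-squad2",
    reader_top_k=10)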


def runSemanticPipeline(pipeline:Pipeline, queries:Union[list,str])->dict:
    """
    Uses the Haystack run or run_batch method depending on whether a single
    query is passed as a string or multiple queries are passed as List[str].

    Params
    -------
    pipeline: Haystack pipeline, as returned by semanticSearchPipeline in
        utils.semantic_search.

    queries: either a single query or a list of queries.

    Return
    -------
    results: dict containing answers and documents as keys and their respective
        values.

    """

    if isinstance(queries, list):
        results = pipeline.run_batch(queries=queries)
    elif isinstance(queries, str):
        results = pipeline.run(query=queries)
    else:
        logging.info("Please check the input type for the queries")
        return

    return results
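
A short illustration of the two call paths; the query strings are placeholders.

# Hypothetical usage sketch (not part of the deleted module):
single_result = runSemanticPipeline(pipeline, "What are the mitigation targets?")
batch_result = runSemanticPipeline(pipeline, ["mitigation targets",
                                              "adaptation plans"])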


def process_query_output(results:dict)->pd.DataFrame:
    """
    Returns a dataframe with the columns ['query','answer','answer_offset',
    'context_offset','context','content','reader_score','retriever_score','id'].
    This is designed for the output of a semantic search pipeline run with a
    single query and a reader as the final node. The output of a pipeline with
    Docs2Answers as the final node, or with multiple queries, needs to be handled
    separately. In those cases use process_semantic_output from
    utils.semantic_search, which uses this function internally to build one
    combined dataframe.

    Params
    ---------
    results: this dictionary should have the keys [query, answers, documents],
        where answers is optional. With Docs2Answers as the final node,
        process_semantic_output does not pass answers, so all answer-related
        values are set to None.

    Return
    --------
    df: dataframe with all the columns mentioned in the function description.

    """
    query_text = results['query']
    answer_dict = {}
    if 'answers' in results.keys():
        for answer in results['answers']:
            answer_dict[answer.document_id] = answer.to_dict()
    docs = results['documents']
    df = pd.DataFrame(columns=['query','answer','answer_offset','context_offset',
                               'context','content','reader_score','retriever_score',
                               'id'])
    for doc in docs:
        row_list = {}
        row_list['query'] = query_text
        row_list['retriever_score'] = doc.score
        row_list['id'] = doc.id
        row_list['content'] = doc.content
        if doc.id in answer_dict.keys():
            row_list['answer'] = answer_dict[doc.id]['answer']
            row_list['context'] = answer_dict[doc.id]['context']
            row_list['reader_score'] = answer_dict[doc.id]['score']
            answer_offset = answer_dict[doc.id]['offsets_in_document'][0]
            row_list['answer_offset'] = [answer_offset['start'], answer_offset['end']]
            start_idx = doc.content.find(row_list['context'])
            end_idx = start_idx + len(row_list['context'])
            row_list['context_offset'] = [start_idx, end_idx]
        else:
            row_list['answer'] = None
            row_list['context'] = None
            row_list['reader_score'] = None
            row_list['answer_offset'] = None
            row_list['context_offset'] = None
        df_dictionary = pd.DataFrame([row_list])
        df = pd.concat([df, df_dictionary], ignore_index=True)

    return df


def process_semantic_output(results):
    """
    Returns a dataframe with the columns ['query','answer','answer_offset',
    'context_offset','context','content','reader_score','retriever_score','id'].
    Distinguishes between a single query and multiple queries by reading the keys
    of the pipeline output dictionary. Uses process_query_output to get the
    dataframe for each query and builds one concatenated dataframe. With
    Docs2Answers as the final node, the answers part is dropped. See the
    documentation of process_query_output.

    Params
    ---------
    results: raw output of runSemanticPipeline.

    Return
    --------
    df: dataframe with all the columns mentioned in the function description.

    """
    output = {}
    if 'query' in results.keys():
        output['query'] = results['query']
        output['documents'] = results['documents']
        if results['node_id'] != 'Docs2Answers':
            output['answers'] = results['answers']
        df = process_query_output(output)
        return df
    if 'queries' in results.keys():
        df = pd.DataFrame(columns=['query','answer','answer_offset',
                                   'context_offset','context','content',
                                   'reader_score','retriever_score','id'])
        for query, answers, documents in zip(results['queries'],
                                             results['answers'], results['documents']):
            output = {}
            output['query'] = query
            output['documents'] = documents
            if results['node_id'] != 'Docs2Answers':
                output['answers'] = answers

            temp = process_query_output(output)
            df = pd.concat([df, temp], ignore_index=True)

        return df


def semanticsearchAnnotator(matches:List[List[int]], document:Text):
    """
    Annotates the spans of the document given by a list of [start index, end index]
    pairs. Example: for the string "How are you today", matches = [[0,3]] gives the
    answer "How" when character offsets are used; if the spaCy matcher were used
    instead, matches = [[0,3]] would cover "How are you", and matching only "How"
    with spaCy would give matches = [[0,1]] for the same string.

    """
    start = 0
    annotated_text = ""
    for match in matches:
        start_idx = match[0]
        end_idx = match[1]
        if check_streamlit():
            annotated_text = (annotated_text + document[start:start_idx]
                              + str(annotation(body=document[start_idx:end_idx],
                              label="Context", background="#964448", color='#ffffff')))
        else:
            annotated_text = (annotated_text + document[start:start_idx]
                              + colored(document[start_idx:end_idx],
                                        "green", attrs=['bold']))
        start = end_idx

    # append the remainder of the document after the last annotated span
    annotated_text = annotated_text + document[start:]

    if check_streamlit():
        st.write(
            markdown(annotated_text),
            unsafe_allow_html=True,
        )
    else:
        print(annotated_text)


def semantic_keywordsearch(query:Text, documents:List[Document],
                           embedding_model:Text,
                           embedding_model_format:Text,
                           embedding_layer:int, reader_model:str,
                           retriever_top_k:int = 10, reader_top_k:int = 10,
                           return_results:bool = False, embedding_dim:int = 768,
                           max_seq_len:int = 512, top_k_per_candidate:int = 1,
                           sort_by:Literal["retriever", "reader"] = 'retriever'):
    """
    Performs semantic search on the list of Haystack documents returned by the
    preprocessing pipeline.

    Params
    -------
    query: keywords that need to be searched in the documents.
    documents: list of Haystack documents returned by the preprocessing pipeline.

    """
    semanticsearch_pipeline, doc_store = semanticSearchPipeline(
        documents=documents,
        embedding_model=embedding_model,
        embedding_layer=embedding_layer,
        embedding_model_format=embedding_model_format,
        reader_model=reader_model, retriever_top_k=retriever_top_k,
        reader_top_k=reader_top_k, embedding_dim=embedding_dim,
        max_seq_len=max_seq_len,
        top_k_per_candidate=top_k_per_candidate)

    raw_output = runSemanticPipeline(semanticsearch_pipeline, query)
    results_df = process_semantic_output(raw_output)
    if sort_by == 'retriever':
        results_df = results_df.sort_values(by=['retriever_score'], ascending=False)
    else:
        results_df = results_df.sort_values(by=['reader_score'], ascending=False)
    # reset the index so the display loop below follows the sorted order
    results_df = results_df.reset_index(drop=True)

    if return_results:
        return results_df
    else:
        if check_streamlit():
            st.markdown("##### Top few semantic search results #####")
        else:
            print("Top few semantic search results")
        for i in range(len(results_df)):
            if check_streamlit():
                st.write("Result {}".format(i+1))
            else:
                print("Result {}".format(i+1))
            semanticsearchAnnotator([results_df.loc[i]['context_offset']],
                                    results_df.loc[i]['content'])
utils/streamlitcheck.py
DELETED
@@ -1,42 +0,0 @@
import logging
try:
    import streamlit as st
except ImportError:
    logging.info("Streamlit not installed")


def check_streamlit():
    """
    Check whether the Python code is being run within Streamlit.

    Returns
    -------
    use_streamlit : boolean
        True if the code is run within Streamlit, else False.
    """
    try:
        from streamlit.scriptrunner.script_run_context import get_script_run_ctx
        if not get_script_run_ctx():
            use_streamlit = False
        else:
            use_streamlit = True
    except ModuleNotFoundError:
        use_streamlit = False
    return use_streamlit

def disable_other_checkboxes(*other_checkboxes_keys):
    # callback: untick every other checkbox so only one stays selected
    for checkbox_key in other_checkboxes_keys:
        st.session_state[checkbox_key] = False

def checkbox_without_preselect(keylist):
    # render one checkbox per key, mutually exclusive, with nothing preselected
    dict_ = {}
    for i, key_val in enumerate(keylist):
        dict_[i] = st.checkbox(key_val, key=key_val,
                               on_change=disable_other_checkboxes,
                               args=tuple(list(filter(lambda x: x != key_val, keylist))),)

    for key, val in dict_.items():
        if val:
            return keylist[int(key)]

    return None
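
A brief, hedged sketch of how these helpers might be used inside a Streamlit page; the checkbox labels are placeholders.

# Hypothetical usage sketch (not part of the deleted module):
import streamlit as st
from utils.streamlitcheck import check_streamlit, checkbox_without_preselect

if check_streamlit():
    choice = checkbox_without_preselect(["Upload Document", "Try Example"])
    st.write("Selected:", choice)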
utils/uploadAndExample.py
DELETED
@@ -1,33 +0,0 @@
import streamlit as st
import tempfile
import json

def add_upload(choice):
    """
    Provides the user with the choice to either 'Upload Document' or 'Try Example'.
    Based on the user's choice, runs the Streamlit widgets and saves the path and
    name of the file to the Streamlit session_state, from where they can be
    fetched later.

    """

    if choice == 'Upload Document':
        uploaded_file = st.sidebar.file_uploader('Upload the File',
                                                 type=['pdf', 'docx', 'txt'])
        if uploaded_file is not None:
            with tempfile.NamedTemporaryFile(mode="wb", delete=False) as temp:
                bytes_data = uploaded_file.getvalue()
                temp.write(bytes_data)
                st.session_state['filename'] = uploaded_file.name
                st.session_state['filepath'] = temp.name

    else:
        # listing the options
        with open('docStore/sample/files.json', 'r') as json_file:
            files = json.load(json_file)

        option = st.sidebar.selectbox('Select the example document',
                                      list(files.keys()))
        file_name = file_path = files[option]
        st.session_state['filename'] = file_name
        st.session_state['filepath'] = file_path
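
A minimal, hedged sketch of a typical call site for add_upload in the app sidebar; the radio labels mirror the choices named in the docstring.

# Hypothetical usage sketch (not part of the deleted module):
import streamlit as st
from utils.uploadAndExample import add_upload

choice = st.sidebar.radio("Choose the option", ("Upload Document", "Try Example"))
add_upload(choice)
st.sidebar.write("Current file:", st.session_state.get("filename"))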