Spaces: GIZ

ppsingh committed · Commit b5e1233 · verified · 1 Parent(s): aa17801

Delete utils

utils/__init__.py DELETED
@@ -1 +0,0 @@
- # adding for package implementation
 
 
utils/checkconfig.py DELETED
@@ -1,15 +0,0 @@
- import configparser
- import logging
-
- def getconfig(configfile_path:str):
-     """
-     configfile_path: file path of .cfg file
-     """
-
-     config = configparser.ConfigParser()
-
-     try:
-         config.read_file(open(configfile_path))
-         return config
-     except:
-         logging.warning("config file not found")
 
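Note: a minimal sketch of how the removed getconfig helper was typically called; the paramconfig.cfg filename is an assumed placeholder, while the 'sdg'/'MODEL' section and key appear in utils/sdg_classifier.py further down this diff.

    from utils.checkconfig import getconfig  # pre-deletion import path

    config = getconfig("paramconfig.cfg")  # placeholder path, not taken from this commit
    if config is not None:
        model_name = config.get("sdg", "MODEL")  # section/key as read in sdg_classifier.py
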
utils/keyword_extraction.py DELETED
@@ -1,140 +0,0 @@
- import pandas as pd
- # from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
- # import nltk
- # nltk.download('stopwords')
- # from nltk.corpus import stopwords
- import pickle
- from typing import List, Text
- import logging
- from summa import keywords
-
- try:
-     import streamlit as st
- except ImportError:
-     logging.info("Streamlit not installed")
-
-
- def sort_coo(coo_matrix):
-     """
-     It takes Coordinate format scipy sparse matrix and extracts info from same.\
-     1. https://kavita-ganesan.com/python-keyword-extraction/#.Y2-TFHbMJPb
-     """
-     tuples = zip(coo_matrix.col, coo_matrix.data)
-     return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
-
- def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
-     """get the feature names and tf-idf score of top n items
-
-     Params
-     ---------
-     feature_names: list of words from vectorizer
-     sorted_items: tuple returned by sort_coo function defined in \
-         keyword_extraction.py
-     topn: topn words to be extracted using tfidf
-
-     Return
-     ----------
-     results: top extracted keywords
-
-     """
-
-     #use only topn items from vector
-     sorted_items = sorted_items[:top_n]
-     score_vals = []
-     feature_vals = []
-
-     # word index and corresponding tf-idf score
-     for idx, score in sorted_items:
-
-         #keep track of feature name and its corresponding score
-         score_vals.append(round(score, 3))
-         feature_vals.append(feature_names[idx])
-
-     results= {}
-     for idx in range(len(feature_vals)):
-         results[feature_vals[idx]]=score_vals[idx]
-
-     return results
-
-
- def tfidf_keyword(textdata:str, vectorizer, tfidfmodel, top_n):
-     """
-     TFIDF based keywords extraction
-
-     Params
-     ---------
-     vectorizer: trained cont vectorizer model
-     tfidfmodel: TFIDF Tranformer model
-     top_n: Top N keywords to be extracted
-     textdata: text data to which needs keyword extraction
-
-     Return
-     ----------
-     keywords: top extracted keywords
-
-     """
-     features = vectorizer.get_feature_names_out()
-     tf_idf_vector=tfidfmodel.transform(vectorizer.transform(textdata))
-     sorted_items=sort_coo(tf_idf_vector.tocoo())
-     results=extract_topn_from_vector(features,sorted_items,top_n)
-     keywords = [keyword for keyword in results]
-     return keywords
-
- def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
-     """
-     TFIDF based keywords extraction
-
-     Params
-     ---------
-     sdg: which sdg tfidf model to be used
-     sdgdata: text data to which needs keyword extraction
-
-
-     Return
-     ----------
-     keywords: top extracted keywords
-
-     """
-     model_path = "docStore/sdg{}/".format(sdg)
-     vectorizer = pickle.load(open(model_path+'vectorizer.pkl', 'rb'))
-     tfidfmodel = pickle.load(open(model_path+'tfidfmodel.pkl', 'rb'))
-     features = vectorizer.get_feature_names_out()
-     tf_idf_vector=tfidfmodel.transform(vectorizer.transform(sdgdata))
-     sorted_items=sort_coo(tf_idf_vector.tocoo())
-     top_n = top_n
-     results=extract_topn_from_vector(features,sorted_items,top_n)
-     keywords = [keyword for keyword in results]
-     return keywords
-
- @st.cache(allow_output_mutation=True)
- def textrank(textdata:Text, ratio:float = 0.1, words:int = 0)->List[str]:
-     """
-     wrappper function to perform textrank, uses either ratio or wordcount to
-     extract top keywords limited by words or ratio.
-     1. https://github.com/summanlp/textrank/blob/master/summa/keywords.py
-
-     Params
-     --------
-     textdata: text data to perform the textrank.
-     ratio: float to limit the number of keywords as proportion of total token \
-         in textdata
-     words: number of keywords to be extracted. Takes priority over ratio if \
-         Non zero. Howevr incase the pagerank returns lesser keywords than \
-         compared to fix value then ratio is used.
-
-     Return
-     --------
-     results: extracted keywords
-     """
-     if words == 0:
-         logging.info("Textrank using defulat ratio value = 0.1, as no words limit given")
-         results = keywords.keywords(textdata, ratio= ratio).split("\n")
-     else:
-         try:
-             results = keywords.keywords(textdata, words= words).split("\n")
-         except:
-             results = keywords.keywords(textdata, ratio = ratio).split("\n")
-
-     return results
-
-
 
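Note: a hedged usage sketch for the removed keyword-extraction helpers, assuming the pickled per-SDG TF-IDF models still sit under docStore/sdg{n}/ as the code above expects; the sample paragraph is invented.

    from utils.keyword_extraction import keyword_extraction, textrank

    paragraphs = ["Access to clean water and sanitation remains a major challenge."]  # illustrative text

    # TF-IDF keywords from the pickled vectorizer/transformer for SDG 6 (docStore/sdg6/)
    tfidf_keywords = keyword_extraction(sdg=6, sdgdata=paragraphs, top_n=10)

    # TextRank keywords on the raw text, capped at 5 words (falls back to ratio internally)
    textrank_keywords = textrank(textdata=" ".join(paragraphs), words=5)
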
utils/lexical_search.py DELETED
@@ -1,251 +0,0 @@
- from haystack.nodes import TfidfRetriever
- from haystack.document_stores import InMemoryDocumentStore
- import spacy
- import re
- from spacy.matcher import Matcher
- from markdown import markdown
- from annotated_text import annotation
- from haystack.schema import Document
- from typing import List, Text, Tuple
- from typing_extensions import Literal
- from utils.preprocessing import processingpipeline
- from utils.streamlitcheck import check_streamlit
- import logging
- try:
-     from termcolor import colored
- except:
-     pass
-
- try:
-     import streamlit as st
- except ImportError:
-     logging.info("Streamlit not installed")
-
-
- def runLexicalPreprocessingPipeline(file_name:str,file_path:str,
-                     split_by: Literal["sentence", "word"] = 'word',
-                     split_length:int = 80, split_overlap:int = 0,
-                     remove_punc:bool = False,)->List[Document]:
-     """
-     creates the pipeline and runs the preprocessing pipeline,
-     the params for pipeline are fetched from paramconfig. As lexical doesnt gets
-     affected by overlap, threfore split_overlap = 0 in default paramconfig and
-     split_by = word.
-
-     Params
-     ------------
-
-     file_name: filename, in case of streamlit application use
-         st.session_state['filename']
-     file_path: filepath, in case of streamlit application use
-         st.session_state['filepath']
-     split_by: document splitting strategy either as word or sentence
-     split_length: when synthetically creating the paragrpahs from document,
-         it defines the length of paragraph.
-     split_overlap: Number of words or sentences that overlap when creating
-         the paragraphs. This is done as one sentence or 'some words' make sense
-         when read in together with others. Therefore the overlap is used.
-         splititng of text.
-     removePunc: to remove all Punctuation including ',' and '.' or not
-
-     Return
-     --------------
-     List[Document]: When preprocessing pipeline is run, the output dictionary
-         has four objects. For the lexicaal search using TFIDFRetriever we
-         need to use the List of Haystack Document, which can be fetched by
-         key = 'documents' on output.
-
-     """
-
-     lexical_processing_pipeline = processingpipeline()
-
-
-     output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
-                             params= {"FileConverter": {"file_path": file_path, \
-                                         "file_name": file_name},
-                                      "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                         "split_by": split_by, \
-                                         "split_length":split_length,\
-                                         "split_overlap": split_overlap}})
-
-     return output_lexical_pre
-
-
- def tokenize_lexical_query(query:str)-> List[str]:
-     """
-     Removes the stop words from query and returns the list of important keywords
-     in query. For the lexical search the relevent paragraphs in document are
-     retreived using TfIDFretreiver from Haystack. However to highlight these
-     keywords we need the tokenized form of query.
-
-     Params
-     --------
-     query: string which represents either list of keywords user is looking for
-         or a query in form of Question.
-
-     Return
-     -----------
-     token_list: list of important keywords in the query.
-
-     """
-     nlp = spacy.load("en_core_web_sm")
-     token_list = [token.text.lower() for token in nlp(query)
-                   if not (token.is_stop or token.is_punct)]
-     return token_list
-
- def runSpacyMatcher(token_list:List[str], document:Text
-                     )->Tuple[List[List[int]],spacy.tokens.doc.Doc]:
-     """
-     Using the spacy in backend finds the keywords in the document using the
-     Matcher class from spacy. We can alternatively use the regex, but spacy
-     finds all keywords in serialized manner which helps in annotation of answers.
-
-     Params
-     -------
-     token_list: this is token list which tokenize_lexical_query function returns
-     document: text in which we need to find the tokens
-
-     Return
-     --------
-     matches: List of [start_index, end_index] in the spacydoc(at word level not
-         character) for the keywords in token list.
-
-     spacydoc: the keyword index in the spacydoc are at word level and not character,
-         therefore to allow the annotator to work seamlessly we return the spacydoc.
-
-     """
-     nlp = spacy.load("en_core_web_sm")
-     spacydoc = nlp(document)
-     matcher = Matcher(nlp.vocab)
-     token_pattern = [[{"LOWER":token}] for token in token_list]
-     matcher.add(",".join(token_list), token_pattern)
-     spacymatches = matcher(spacydoc)
-
-     # getting start and end index in spacydoc so that annotator can work seamlessly
-     matches = []
-     for match_id, start, end in spacymatches:
-         matches = matches + [[start, end]]
-
-     return matches, spacydoc
-
- def runRegexMatcher(token_list:List[str], document:Text):
-     """
-     Using the regex in backend finds the keywords in the document.
-
-     Params
-     -------
-     token_list: this is token list which tokenize_lexical_query function returns
-
-     document: text in which we need to find the tokens
-
-     Return
-     --------
-     matches: List of [start_index, end_index] in the document for the keywords
-         in token list at character level.
-
-     document: the keyword index returned by regex are at character level,
-         therefore to allow the annotator to work seamlessly we return the text back.
-
-     """
-     matches = []
-     for token in token_list:
-         matches = (matches +
-                    [[val.start(), val.start() +
-                      len(token)] for val in re.finditer(token, document)])
-
-     return matches, document
-
- def spacyAnnotator(matches: List[List[int]], document:spacy.tokens.doc.Doc):
-     """
-     This is spacy Annotator and needs spacy.doc
-     Annotates the text in the document defined by list of [start index, end index]
-     Example: "How are you today", if document type is text, matches = [[0,3]]
-     will give answer = "How", however in case we used the spacy matcher then the
-     matches = [[0,3]] will give answer = "How are you". However if spacy is used
-     to find "How" then the matches = [[0,1]] for the string defined above.
-
-     Params
-     -----------
-     matches: As mentioned its list of list. Example [[0,1],[10,13]]
-     document: document which needs to be indexed.
-
-
-     Return
-     --------
-     will send the output to either app front end using streamlit or
-     write directly to output screen.
-
-     """
-     start = 0
-     annotated_text = ""
-     for match in matches:
-         start_idx = match[0]
-         end_idx = match[1]
-
-         if check_streamlit():
-             annotated_text = (annotated_text + document[start:start_idx].text
-                               + str(annotation(body=document[start_idx:end_idx].text,
-                                     label="ANSWER", background="#964448", color='#ffffff')))
-         else:
-             annotated_text = (annotated_text + document[start:start_idx].text
-                               + colored(document[start_idx:end_idx].text,
-                                         "green", attrs = ['bold']))
-
-
-         start = end_idx
-
-     annotated_text = annotated_text + document[end_idx:].text
-
-
-     if check_streamlit():
-
-         st.write(
-             markdown(annotated_text),
-             unsafe_allow_html=True,
-         )
-     else:
-         print(annotated_text)
-
- def lexical_search(query:Text, documents:List[Document],top_k:int):
-     """
-     Performs the Lexical search on the List of haystack documents which is
-     returned by preprocessing Pipeline.
-
-     Params
-     -------
-     query: Keywords that need to be searche in documents.
-     documents: List of Haystack documents returned by preprocessing pipeline.
-     top_k: Number of Top results to be fetched.
-
-     """
-
-     document_store = InMemoryDocumentStore()
-     document_store.write_documents(documents)
-
-     # Haystack Retriever works with document stores only.
-     retriever = TfidfRetriever(document_store)
-     results = retriever.retrieve(query=query, top_k = top_k)
-     query_tokens = tokenize_lexical_query(query)
-     flag = True
-     for count, result in enumerate(results):
-         matches, doc = runSpacyMatcher(query_tokens,result.content)
-
-         if len(matches) != 0:
-             if flag:
-                 flag = False
-                 if check_streamlit():
-                     st.markdown("##### Top few lexical search (TFIDF) hits #####")
-                 else:
-                     print("Top few lexical search (TFIDF) hits")
-
-             if check_streamlit():
-                 st.write("Result {}".format(count+1))
-             else:
-                 print("Results {}".format(count +1))
-             spacyAnnotator(matches, doc)
-
-     if flag:
-         if check_streamlit():
-             st.info("🤔 No relevant result found. Please try another keyword.")
-         else:
-             print("No relevant result found. Please try another keyword.")
 
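Note: a minimal sketch of the removed lexical-search flow, assuming the pre-deletion utils package is still importable; the file name and query below are placeholders.

    from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search

    output = runLexicalPreprocessingPipeline(file_name="example.pdf",             # placeholder file
                                             file_path="docStore/sample/example.pdf")
    # TF-IDF retrieval plus keyword highlighting (Streamlit or terminal output)
    lexical_search(query="water sanitation",
                   documents=output['documents'],
                   top_k=5)
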
utils/ndc_explorer.py DELETED
@@ -1,90 +0,0 @@
-
- import urllib.request
- import json
-
- link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
- def get_document(country_code: str):
-     """
-     read the country NDC data from
-     https://klimalog.die-gdi.de/ndc/open-data/dataset.json
-     using the country code.
-
-     Params
-     -------
-     country_code:"""
-     with urllib.request.urlopen(link) as urlfile:
-         data = json.loads(urlfile.read())
-     categoriesData = {}
-     categoriesData['categories']= data['categories']
-     categoriesData['subcategories']= data['subcategories']
-     keys_sub = categoriesData['subcategories'].keys()
-     documentType= 'NDCs'
-     if documentType in data.keys():
-         if country_code in data[documentType].keys():
-             get_dict = {}
-             for key, value in data[documentType][country_code].items():
-                 if key not in ['country_name','region_id', 'region_name']:
-                     get_dict[key] = value['classification']
-                 else:
-                     get_dict[key] = value
-         else:
-             return None
-     else:
-         return None
-
-     country = {}
-     for key in categoriesData['categories']:
-         country[key]= {}
-     for key,value in categoriesData['subcategories'].items():
-         country[value['category']][key] = get_dict[key]
-
-     return country
-
-
- def countrySpecificCCA(cca_sent:dict, threshold:int, countryCode:str):
-     """
-     based on the countrycode, reads the country data from
-     https://klimalog.die-gdi.de/ndc/open-data/dataset.json
-     using get_documents from utils.ndc_explorer.py
-     then based on thereshold value filters the Climate Change Adaptation
-     targets assigned by NDC explorer team to that country. Using the sentences
-     create by Data services team of GIZ for each target level, tries to find the
-     relevant passages from the document by doing the semantic search.
-
-     Params
-     -------
-     cca_sent: dictionary with key as 'target labels' and manufactured sentences
-         reflecting the target level. Please see the docStore/ndcs/cca.txt
-
-     threshold: NDC target have many categoriees ranging from [0-5], with 0
-         refelcting most relaxed attitude and 5 being most aggrisive towards Climate
-         change. We select the threshold value beyond which we need to focus on.
-
-     countryCode: standard country code to allow us to fetch the country specific
-         data.
-
-     """
-     temp = {}
-     doc = get_document(countryCode)
-     for key,value in cca_sent.items():
-         id_ = doc['climate change adaptation'][key]['id']
-         if id_ >threshold:
-             temp[key] = value['id'][id_]
-     return temp
-
-
- def countrySpecificCCM(ccm_sent, threshold, countryCode):
-     """
-     see the documentation of countrySpecificCCA. This is same instead of
-     this gets the data pertaining to Adaptation
-
-     """
-
-     temp = {}
-     doc = get_document(countryCode)
-     for key,value in ccm_sent.items():
-         id_ = doc['climate change mitigation'][key]['id']
-         if id_ >threshold:
-             temp[key] = value['id'][id_]
-
-     return temp
 
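Note: a hedged sketch of how the removed NDC explorer helpers were called; the country code, target label and sentence below are invented placeholders — the real sentences were read from docStore/ndcs/cca.txt, as the docstring above states.

    from utils.ndc_explorer import get_document, countrySpecificCCA

    country_data = get_document("KEN")   # placeholder country code

    cca_sent = {"adaptation planning":                                             # placeholder label
                {"id": {4: "The NDC sets out a detailed adaptation plan."}}}       # placeholder sentence
    selected = countrySpecificCCA(cca_sent=cca_sent, threshold=3, countryCode="KEN")
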
utils/preprocessing.py DELETED
@@ -1,260 +0,0 @@
- from haystack.nodes.base import BaseComponent
- from haystack.schema import Document
- from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
- from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
- from typing import Callable, Dict, List, Optional, Text, Tuple, Union
- from typing_extensions import Literal
- import pandas as pd
- import logging
- import re
- import string
- from haystack.pipelines import Pipeline
-
- def useOCR(file_path: str)-> Text:
-     """
-     Converts image pdfs into text, Using the Farm-haystack[OCR]
-
-     Params
-     ----------
-     file_path: file_path of uploade file, returned by add_upload function in
-         uploadAndExample.py
-
-     Returns the text file as string.
-     """
-
-
-     converter = PDFToTextOCRConverter(remove_numeric_tables=True,
-                                       valid_languages=["eng"])
-     docs = converter.convert(file_path=file_path, meta=None)
-     return docs[0].content
-
-
-
-
-
- class FileConverter(BaseComponent):
-     """
-     Wrapper class to convert uploaded document into text by calling appropriate
-     Converter class, will use internally haystack PDFToTextOCR in case of image
-     pdf. Cannot use the FileClassifier from haystack as its doesnt has any
-     label/output class for image.
-
-     1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
-     2. https://docs.haystack.deepset.ai/docs/file_converters
-     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
-     4. https://docs.haystack.deepset.ai/reference/file-converters-api
-
-
-     """
-
-     outgoing_edges = 1
-
-     def run(self, file_name: str , file_path: str, encoding: Optional[str]=None,
-             id_hash_keys: Optional[List[str]] = None,
-             ) -> Tuple[dict,str]:
-         """ this is required method to invoke the component in
-         the pipeline implementation.
-
-         Params
-         ----------
-         file_name: name of file
-         file_path: file_path of uploade file, returned by add_upload function in
-             uploadAndExample.py
-
-         See the links provided in Class docstring/description to see other params
-
-         Return
-         ---------
-         output: dictionary, with key as identifier and value could be anything
-             we need to return. In this case its the List of Hasyatck Document
-
-         output_1: As there is only one outgoing edge, we pass 'output_1' string
-         """
-         try:
-             if file_name.endswith('.pdf'):
-                 converter = PDFToTextConverter(remove_numeric_tables=True)
-             if file_name.endswith('.txt'):
-                 converter = TextConverter(remove_numeric_tables=True)
-             if file_name.endswith('.docx'):
-                 converter = DocxToTextConverter()
-         except Exception as e:
-             logging.error(e)
-             return
-
-
-
-         documents = []
-
-         document = converter.convert(
-                       file_path=file_path, meta=None,
-                       encoding=encoding, id_hash_keys=id_hash_keys
-                       )[0]
-
-         text = document.content
-
-         # if file is image pdf then it will have {'content': "\x0c\x0c\x0c\x0c"}
-         # subsitute this substring with '',and check if content is empty string
-
-         text = re.sub(r'\x0c', '', text)
-         documents.append(Document(content=text,
-                                   meta={"name": file_name},
-                                   id_hash_keys=id_hash_keys))
-
-
-         # check if text is empty and apply pdfOCR converter.
-         for i in documents:
-             if i.content == "":
-                 logging.info("Using OCR")
-                 i.content = useOCR(file_path)
-
-         logging.info('file conversion succesful')
-         output = {'documents': documents}
-         return output, 'output_1'
-
-     def run_batch():
-         """
-         we dont have requirement to process the multiple files in one go
-         therefore nothing here, however to use the custom node we need to have
-         this method for the class.
-         """
-
-         return
-
-
- def basic(s:str, remove_punc:bool = False):
-
-     """
-     Performs basic cleaning of text.
-
-     Params
-     ----------
-     s: string to be processed
-     removePunc: to remove all Punctuation including ',' and '.' or not
-
-     Returns: processed string: see comments in the source code for more info
-     """
-
-     # Remove URLs
-     s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
-     s = re.sub(r"http\S+", " ", s)
-
-     # Remove new line characters
-     s = re.sub('\n', ' ', s)
-
-     # Remove punctuations
-     if remove_punc == True:
-         translator = str.maketrans(' ', ' ', string.punctuation)
-         s = s.translate(translator)
-     # Remove distracting single quotes and dotted pattern
-     s = re.sub("\'", " ", s)
-     s = s.replace("..","")
-
-     return s.strip()
-
-
- class UdfPreProcessor(BaseComponent):
-     """
-     class to preprocess the document returned by FileConverter. It will check
-     for splitting strategy and splits the document by word or sentences and then
-     synthetically create the paragraphs.
-
-     1. https://docs.haystack.deepset.ai/docs/preprocessor
-     2. https://docs.haystack.deepset.ai/reference/preprocessor-api
-     3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
-
-     """
-     outgoing_edges = 1
-
-     def run(self, documents:List[Document], remove_punc:bool=False,
-             split_by: Literal["sentence", "word"] = 'sentence',
-             split_length:int = 2, split_respect_sentence_boundary:bool = False,
-             split_overlap:int = 0):
-
-         """ this is required method to invoke the component in
-         the pipeline implementation.
-
-         Params
-         ----------
-         documents: documents from the output dictionary returned by Fileconverter
-         remove_punc: to remove all Punctuation including ',' and '.' or not
-         split_by: document splitting strategy either as word or sentence
-         split_length: when synthetically creating the paragrpahs from document,
-             it defines the length of paragraph.
-         split_respect_sentence_boundary: Used when using 'word' strategy for
-             splititng of text.
-         split_overlap: Number of words or sentences that overlap when creating
-             the paragraphs. This is done as one sentence or 'some words' make sense
-             when read in together with others. Therefore the overlap is used.
-
-         Return
-         ---------
-         output: dictionary, with key as identifier and value could be anything
-             we need to return. In this case the output will contain 4 objects
-             the paragraphs text list as List, Haystack document, Dataframe and
-             one raw text file.
-
-         output_1: As there is only one outgoing edge, we pass 'output_1' string
-
-         """
-
-         if split_by == 'sentence':
-             split_respect_sentence_boundary = False
-
-         else:
-             split_respect_sentence_boundary = split_respect_sentence_boundary
-
-         preprocessor = PreProcessor(
-             clean_empty_lines=True,
-             clean_whitespace=True,
-             clean_header_footer=True,
-             split_by=split_by,
-             split_length=split_length,
-             split_respect_sentence_boundary= split_respect_sentence_boundary,
-             split_overlap=split_overlap,
-
-             # will add page number only in case of PDF not for text/docx file.
-             add_page_number=True
-         )
-
-         for i in documents:
-             # # basic cleaning before passing it to preprocessor.
-             # i = basic(i)
-             docs_processed = preprocessor.process([i])
-             for item in docs_processed:
-                 item.content = basic(item.content, remove_punc= remove_punc)
-
-         df = pd.DataFrame(docs_processed)
-         all_text = " ".join(df.content.to_list())
-         para_list = df.content.to_list()
-         logging.info('document split into {} paragraphs'.format(len(para_list)))
-         output = {'documents': docs_processed,
-                   'dataframe': df,
-                   'text': all_text,
-                   'paraList': para_list
-                  }
-         return output, "output_1"
-     def run_batch():
-         """
-         we dont have requirement to process the multiple files in one go
-         therefore nothing here, however to use the custom node we need to have
-         this method for the class.
-         """
-         return
-
- def processingpipeline():
-     """
-     Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcesor
-     from utils.preprocessing
-
-     """
-
-     preprocessing_pipeline = Pipeline()
-     file_converter = FileConverter()
-     custom_preprocessor = UdfPreProcessor()
-
-     preprocessing_pipeline.add_node(component=file_converter,
-                                     name="FileConverter", inputs=["File"])
-     preprocessing_pipeline.add_node(component = custom_preprocessor,
-                                     name ='UdfPreProcessor', inputs=["FileConverter"])
-
-     return preprocessing_pipeline
-
 
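Note: a minimal sketch of the removed preprocessing pipeline, mirroring how the other utils modules in this commit invoke it; the file path below is a placeholder.

    from utils.preprocessing import processingpipeline

    pipeline = processingpipeline()
    result = pipeline.run(file_paths="docStore/sample/example.pdf",               # placeholder file
                          params={"FileConverter": {"file_path": "docStore/sample/example.pdf",
                                                    "file_name": "example.pdf"},
                                  "UdfPreProcessor": {"remove_punc": False,
                                                      "split_by": "sentence",
                                                      "split_length": 2,
                                                      "split_overlap": 0}})
    paragraphs = result['paraList']   # other keys: 'documents', 'dataframe', 'text'
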
utils/sdg_classifier.py DELETED
@@ -1,177 +0,0 @@
- from haystack.nodes import TransformersDocumentClassifier
- from haystack.schema import Document
- from typing import List, Tuple
- from typing_extensions import Literal
- import logging
- import pandas as pd
- from pandas import DataFrame, Series
- from utils.checkconfig import getconfig
- from utils.streamlitcheck import check_streamlit
- from utils.preprocessing import processingpipeline
- try:
-     import streamlit as st
- except ImportError:
-     logging.info("Streamlit not installed")
-
- ## Labels dictionary ###
- _lab_dict = {0: 'no_cat',
-              1:'SDG 1 - No poverty',
-              2:'SDG 2 - Zero hunger',
-              3:'SDG 3 - Good health and well-being',
-              4:'SDG 4 - Quality education',
-              5:'SDG 5 - Gender equality',
-              6:'SDG 6 - Clean water and sanitation',
-              7:'SDG 7 - Affordable and clean energy',
-              8:'SDG 8 - Decent work and economic growth',
-              9:'SDG 9 - Industry, Innovation and Infrastructure',
-              10:'SDG 10 - Reduced inequality',
-              11:'SDG 11 - Sustainable cities and communities',
-              12:'SDG 12 - Responsible consumption and production',
-              13:'SDG 13 - Climate action',
-              14:'SDG 14 - Life below water',
-              15:'SDG 15 - Life on land',
-              16:'SDG 16 - Peace, justice and strong institutions',
-              17:'SDG 17 - Partnership for the goals',}
-
- @st.cache(allow_output_mutation=True)
- def load_sdgClassifier(config_file:str = None, classifier_name:str = None):
-     """
-     loads the document classifier using haystack, where the name/path of model
-     in HF-hub as string is used to fetch the model object.Either configfile or
-     model should be passed.
-     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
-     2. https://docs.haystack.deepset.ai/docs/document_classifier
-
-     Params
-     --------
-     config_file: config file path from which to read the model name
-     classifier_name: if modelname is passed, it takes a priority if not \
-         found then will look for configfile, else raise error.
-
-
-     Return: document classifier model
-     """
-     if not classifier_name:
-         if not config_file:
-             logging.warning("Pass either model name or config file")
-             return
-         else:
-             config = getconfig(config_file)
-             classifier_name = config.get('sdg','MODEL')
-
-     logging.info("Loading classifier")
-     doc_classifier = TransformersDocumentClassifier(
-                         model_name_or_path=classifier_name,
-                         task="text-classification")
-
-     return doc_classifier
-
-
- @st.cache(allow_output_mutation=True)
- def sdg_classification(haystack_doc:List[Document],
-                        threshold:float = 0.8,
-                        classifier_model:TransformersDocumentClassifier= None
-                        )->Tuple[DataFrame,Series]:
-     """
-     Text-Classification on the list of texts provided. Classifier provides the
-     most appropriate label for each text. these labels are in terms of if text
-     belongs to which particular Sustainable Devleopment Goal (SDG).
-
-     Params
-     ---------
-     haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
-         contains the list of paragraphs in different format,here the list of
-         Haystack Documents is used.
-     threshold: threshold value for the model to keep the results from classifier
-     classifiermodel: you can pass the classifier model directly,which takes priority
-         however if not then looks for model in streamlit session.
-         In case of streamlit avoid passing the model directly.
-
-
-     Returns
-     ----------
-     df: Dataframe with two columns['SDG:int', 'text']
-     x: Series object with the unique SDG covered in the document uploaded and
-         the number of times it is covered/discussed/count_of_paragraphs.
-
-     """
-     logging.info("Working on SDG Classification")
-     if not classifier_model:
-         if check_streamlit():
-             classifier_model = st.session_state['sdg_classifier']
-         else:
-             logging.warning("No streamlit envinornment found, Pass the classifier")
-             return
-
-     results = classifier_model.predict(haystack_doc)
-
-
-     labels_= [(l.meta['classification']['label'],
-                l.meta['classification']['score'],l.content,) for l in results]
-
-     df = DataFrame(labels_, columns=["SDG","Relevancy","text"])
-
-     df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
-     df.index += 1
-     df =df[df['Relevancy']>threshold]
-
-     # creating the dataframe for value counts of SDG, along with 'title' of SDGs
-     x = df['SDG'].value_counts()
-     x = x.rename('count')
-     x = x.rename_axis('SDG').reset_index()
-     x["SDG"] = pd.to_numeric(x["SDG"])
-     x = x.sort_values(by=['count'], ascending=False)
-     x['SDG_name'] = x['SDG'].apply(lambda x: _lab_dict[x])
-     x['SDG_Num'] = x['SDG'].apply(lambda x: "SDG "+str(x))
-
-     df['SDG'] = pd.to_numeric(df['SDG'])
-     df = df.sort_values('SDG')
-
-     return df, x
-
- def runSDGPreprocessingPipeline(file_name:str, file_path:str,
-             split_by: Literal["sentence", "word"] = 'sentence',
-             split_length:int = 2, split_respect_sentence_boundary:bool = False,
-             split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
-     """
-     creates the pipeline and runs the preprocessing pipeline,
-     the params for pipeline are fetched from paramconfig
-
-     Params
-     ------------
-
-     file_name: filename, in case of streamlit application use
-         st.session_state['filename']
-     file_path: filepath, in case of streamlit application use st.session_state['filepath']
-     split_by: document splitting strategy either as word or sentence
-     split_length: when synthetically creating the paragrpahs from document,
-         it defines the length of paragraph.
-     split_respect_sentence_boundary: Used when using 'word' strategy for
-         splititng of text.
-     split_overlap: Number of words or sentences that overlap when creating
-         the paragraphs. This is done as one sentence or 'some words' make sense
-         when read in together with others. Therefore the overlap is used.
-     remove_punc: to remove all Punctuation including ',' and '.' or not
-
-
-     Return
-     --------------
-     List[Document]: When preprocessing pipeline is run, the output dictionary
-         has four objects. For the Haysatck implementation of SDG classification we,
-         need to use the List of Haystack Document, which can be fetched by
-         key = 'documents' on output.
-
-     """
-
-     sdg_processing_pipeline = processingpipeline()
-
-     output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
-                         params= {"FileConverter": {"file_path": file_path, \
-                                     "file_name": file_name},
-                                  "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                     "split_by": split_by, \
-                                     "split_length":split_length,\
-                                     "split_overlap": split_overlap, \
-                                     "split_respect_sentence_boundary":split_respect_sentence_boundary}})
-
-     return output_sdg_pre
 
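Note: a hedged sketch of the removed SDG classification flow; the classifier model id and file path are placeholders — in the app the model name was read from the config file via getconfig, as the code above shows.

    from utils.sdg_classifier import (load_sdgClassifier, runSDGPreprocessingPipeline,
                                      sdg_classification)

    docs = runSDGPreprocessingPipeline(file_name="example.pdf",                        # placeholder file
                                       file_path="docStore/sample/example.pdf")['documents']
    classifier = load_sdgClassifier(classifier_name="org/sdg-classifier")              # placeholder model id
    df, counts = sdg_classification(haystack_doc=docs, threshold=0.8,
                                    classifier_model=classifier)
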
utils/semantic_search.py DELETED
@@ -1,582 +0,0 @@
- from haystack.nodes import TransformersQueryClassifier, Docs2Answers
- from haystack.nodes import EmbeddingRetriever, FARMReader
- from haystack.nodes.base import BaseComponent
- from haystack.document_stores import InMemoryDocumentStore
- from markdown import markdown
- from annotated_text import annotation
- from haystack.schema import Document
- from typing import List, Text, Union
- from typing_extensions import Literal
- from utils.preprocessing import processingpipeline
- from utils.streamlitcheck import check_streamlit
- from haystack.pipelines import Pipeline
- import pandas as pd
- import logging
- try:
-     from termcolor import colored
- except:
-     pass
- try:
-     import streamlit as st
- except ImportError:
-     logging.info("Streamlit not installed")
-
-
- @st.cache(allow_output_mutation=True)
- def loadQueryClassifier():
-     """
-     retuns the haystack query classifier model
-     model = shahrukhx01/bert-mini-finetune-question-detection
-
-     """
-     query_classifier = TransformersQueryClassifier(model_name_or_path=
-         "shahrukhx01/bert-mini-finetune-question-detection")
-     return query_classifier
-
- class QueryCheck(BaseComponent):
-     """
-     Uses Query Classifier from Haystack, process the query based on query type.
-     Ability to determine the statements is not so good, therefore the chances
-     statement also get modified. Ex: "List water related issues" will be
-     identified by the model as keywords, and therefore it be processed as "what
-     are the 'list all water related issues' related issues and discussions?".
-     This is one shortcoming but is igonred for now, as semantic search will not
-     get affected a lot, by this. If you want to pass keywords list and want to
-     do batch processing use. run_batch. Example: if you want to find relevant
-     passages for water, food security, poverty then querylist = ["water", "food
-     security","poverty"] and then execute QueryCheck.run_batch(queries = querylist)
-
-     1. https://docs.haystack.deepset.ai/docs/query_classifier
-
-     """
-
-     outgoing_edges = 1
-
-     def run(self, query:str):
-         """
-         mandatory method to use the custom node. Determines the query type, if
-         if the query is of type keyword/statement will modify it to make it more
-         useful for sentence transoformers.
-
-         Params
-         --------
-         query: query/statement/keywords in form of string
-
-         Return
-         ------
-         output: dictionary, with key as identifier and value could be anything
-             we need to return. In this case the output contain key = 'query'.
-
-         output_1: As there is only one outgoing edge, we pass 'output_1' string
-
-         """
-         query_classifier = loadQueryClassifier()
-         result = query_classifier.run(query=query)
-
-         if result[1] == "output_1":
-             output = {"query":query,
-                       "query_type": 'question/statement'}
-         else:
-             output = {"query": "what are the {} related issues and \
-                       discussions?".format(query),
-                       "query_type": 'statements/keyword'}
-         logging.info(output)
-         return output, "output_1"
-
-     def run_batch(self, queries:List[str]):
-         """
-         running multiple queries in one go, howeevr need the queries to be passed
-         as list of string. Example: if you want to find relevant passages for
-         water, food security, poverty then querylist = ["water", "food security",
-         "poverty"] and then execute QueryCheck.run_batch(queries = querylist)
-
-         Params
-         --------
-         queries: queries/statements/keywords in form of string encapsulated
-             within List
-
-         Return
-         ------
-         output: dictionary, with key as identifier and value could be anything
-             we need to return. In this case the output contain key = 'queries'.
-
-         output_1: As there is only one outgoing edge, we pass 'output_1' string
-         """
-         query_classifier = loadQueryClassifier()
-         query_list = []
-         for query in queries:
-             result = query_classifier.run(query=query)
-             if result[1] == "output_1":
-                 query_list.append(query)
-             else:
-                 query_list.append("what are the {} related issues and \
-                                   discussions?".format(query))
-         output = {'queries':query_list}
-         logging.info(output)
-         return output, "output_1"
-
-
- @st.cache(allow_output_mutation=True)
- def runSemanticPreprocessingPipeline(file_path:str, file_name:str,
-                 split_by: Literal["sentence", "word"] = 'sentence',
-                 split_length:int = 2, split_overlap:int = 0,
-                 split_respect_sentence_boundary:bool = False,
-                 remove_punc:bool = False)->List[Document]:
-     """
-     creates the pipeline and runs the preprocessing pipeline.
-
-     Params
-     ------------
-
-     file_name: filename, in case of streamlit application use
-         st.session_state['filename']
-     file_path: filepath, in case of streamlit application use
-         st.session_state['filepath']
-     split_by: document splitting strategy either as word or sentence
-     split_length: when synthetically creating the paragrpahs from document,
-         it defines the length of paragraph.
-     split_overlap: Number of words or sentences that overlap when creating the
-         paragraphs. This is done as one sentence or 'some words' make sense
-         when read in together with others. Therefore the overlap is used.
-     split_respect_sentence_boundary: Used when using 'word' strategy for
-         splititng of text.
-     remove_punc: to remove all Punctuation including ',' and '.' or not
-
-     Return
-     --------------
-     List[Document]: When preprocessing pipeline is run, the output dictionary
-         has four objects. For the Haysatck implementation of semantic search we,
-         need to use the List of Haystack Document, which can be fetched by
-         key = 'documents' on output.
-
-     """
-
-     semantic_processing_pipeline = processingpipeline()
-
-     output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
-                             params= {"FileConverter": {"file_path": file_path, \
-                                         "file_name": file_name},
-                                      "UdfPreProcessor": {"remove_punc": remove_punc, \
-                                         "split_by": split_by, \
-                                         "split_length":split_length,\
-                                         "split_overlap": split_overlap,
-                                         "split_respect_sentence_boundary":split_respect_sentence_boundary}})
-
-     return output_semantic_pre
-
-
- @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
-           allow_output_mutation=True)
- def loadRetriever(embedding_model:Text=None, embedding_model_format:Text = None,
-                   embedding_layer:int = None, retriever_top_k:int = 10,
-                   max_seq_len:int=512, document_store:InMemoryDocumentStore=None):
-     """
-     Returns the Retriever model based on params provided.
-     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
-     2. https://www.sbert.net/examples/applications/semantic-search/README.html
-     3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
-
-
-     Params
-     ---------
-     embedding_model: Name of the model to be used for embedding. Check the links
-         provided in documentation
-     embedding_model_format: check the github link of Haystack provided in
-         documentation embedding_layer: check the github link of Haystack
-         provided in documentation retriever_top_k: Number of Top results to
-         be returned by
-     retriever max_seq_len: everymodel has max seq len it can handle, check in
-         model card. Needed to hanlde the edge cases.
-     document_store: InMemoryDocumentStore, write haystack Document list to
-         DocumentStore and pass the same to function call. Can be done using
-         createDocumentStore from utils.
-
-     Return
-     -------
-     retriever: embedding model
-     """
-     logging.info("loading retriever")
-     if document_store is None:
-         logging.warning("Retriever initialization requires the DocumentStore")
-         return
-
-     retriever = EmbeddingRetriever(
-                 embedding_model=embedding_model,top_k = retriever_top_k,
-                 document_store = document_store,
-                 emb_extraction_layer=embedding_layer, scale_score =True,
-                 model_format=embedding_model_format, use_gpu = True,
-                 max_seq_len = max_seq_len )
-     if check_streamlit:
-         st.session_state['retriever'] = retriever
-     return retriever
-
- @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
-           allow_output_mutation=True)
- def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
-                         embedding_dim:int = 768):
-     """
-     Creates the InMemory Document Store from haystack list of Documents.
-     It is mandatory component for Retriever to work in Haystack frame work.
-
-     Params
-     -------
-     documents: List of haystack document. If using the preprocessing pipeline,
-         can be fetched key = 'documents; on output of preprocessing pipeline.
-     similarity: scoring function, can be either 'cosine' or 'dot_product'
-     embedding_dim: Document store has default value of embedding size = 768, and
-         update_embeddings method of Docstore cannot infer the embedding size of
-         retiever automatically, therefore set this value as per the model card.
-
-     Return
-     -------
-     document_store: InMemory Document Store object type.
-
-     """
-     document_store = InMemoryDocumentStore(similarity = similarity,
-                                            embedding_dim = embedding_dim )
-     document_store.write_documents(documents)
-
-     return document_store
-
-
- @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
-           allow_output_mutation=True)
- def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
-                 embedding_model_format:Text = None,embedding_layer:int = None,
-                 embedding_dim:int = 768,retriever_top_k:int = 10,
-                 reader_model:str = None, reader_top_k:int = 10,
-                 max_seq_len:int =512,useQueryCheck = True,
-                 top_k_per_candidate:int = 1):
-     """
-     creates the semantic search pipeline and document Store object from the
-     list of haystack documents. The top_k for the Reader and Retirever are kept
-     same, so that all the results returned by Retriever are used, however the
-     context is extracted by Reader for each retrieved result. The querycheck is
-     added as node to process the query. This pipeline is suited for keyword search,
-     and to some extent extractive QA purpose. The purpose of Reader is strictly to
-     highlight the context for retrieved result and not for QA, however as stated
-     it can work for QA too in limited sense.
-     There are 4 variants of pipeline it can return
-     1.QueryCheck > Retriever > Reader
-     2.Retriever > Reader
-     3.QueryCheck > Retriever > Docs2Answers : If reader is None,
-         then Doc2answer is used to keep the output of pipeline structurally same.
-     4.Retriever > Docs2Answers
-
-     Links
-
-     1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
-     2. https://www.sbert.net/examples/applications/semantic-search/README.html
-     3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
-     4. https://docs.haystack.deepset.ai/docs/reader
-
-
-     Params
-     ----------
-     documents: list of Haystack Documents, returned by preprocessig pipeline.
-     embedding_model: Name of the model to be used for embedding. Check the links
-         provided in documentation
-     embedding_model_format: check the github link of Haystack provided in
-         documentation
-     embedding_layer: check the github link of Haystack provided in documentation
-     embedding_dim: Document store has default value of embedding size = 768, and
-         update_embeddings method of Docstore cannot infer the embedding size of
-         retiever automatically, therefore set this value as per the model card.
-     retriever_top_k: Number of Top results to be returned by retriever
-     reader_model: Name of the model to be used for Reader node in hasyatck
-         Pipeline. Check the links provided in documentation
-     reader_top_k: Reader will use retrieved results to further find better matches.
-         As purpose here is to use reader to extract context, the value is
-         same as retriever_top_k.
-     max_seq_len:everymodel has max seq len it can handle, check in model card.
-         Needed to hanlde the edge cases
-     useQueryCheck: Whether to use the querycheck which modifies the query or not.
-     top_k_per_candidate:How many answers to extract for each candidate doc
-         that is coming from the retriever
-
-     Return
-     ---------
-     semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
-         nodes [QueryCheck, Retriever, Reader/Docs2Answer]. If reader is None,
-         then Doc2answer is used to keep the output of pipeline structurally
-         same.
-
-     document_store: As retriever can work only with Haystack Document Store, the
-         list of document returned by preprocessing pipeline are fed into to
-         get InMemmoryDocumentStore object type, with retriever updating the
-         embeddings of each paragraph in document store.
-
-     """
-     document_store = createDocumentStore(documents=documents,
-                                          embedding_dim=embedding_dim)
-     retriever = loadRetriever(embedding_model = embedding_model,
-                     embedding_model_format=embedding_model_format,
-                     embedding_layer=embedding_layer,
-                     retriever_top_k= retriever_top_k,
-                     document_store = document_store,
-                     max_seq_len=max_seq_len)
-     document_store.update_embeddings(retriever)
-     semantic_search_pipeline = Pipeline()
-     if useQueryCheck and reader_model:
-         querycheck = QueryCheck()
-         reader = FARMReader(model_name_or_path=reader_model,
-                             top_k = reader_top_k, use_gpu=True,
-                             top_k_per_candidate = top_k_per_candidate)
-         semantic_search_pipeline.add_node(component = querycheck,
-                             name = "QueryCheck",inputs = ["Query"])
-         semantic_search_pipeline.add_node(component = retriever,
-                             name = "EmbeddingRetriever",inputs = ["QueryCheck.output_1"])
-         semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
-                             inputs= ["EmbeddingRetriever"])
-
-     elif reader_model :
-         reader = FARMReader(model_name_or_path=reader_model,
-                             top_k = reader_top_k, use_gpu=True,
-                             top_k_per_candidate = top_k_per_candidate)
-         semantic_search_pipeline.add_node(component = retriever,
-                             name = "EmbeddingRetriever",inputs = ["Query"])
-         semantic_search_pipeline.add_node(component = reader,
-                             name = "FARMReader",inputs= ["EmbeddingRetriever"])
-     elif useQueryCheck and not reader_model:
-         querycheck = QueryCheck()
-         docs2answers = Docs2Answers()
-         semantic_search_pipeline.add_node(component = querycheck,
-                             name = "QueryCheck",inputs = ["Query"])
-         semantic_search_pipeline.add_node(component = retriever,
-                             name = "EmbeddingRetriever",inputs = ["QueryCheck.output_1"])
-         semantic_search_pipeline.add_node(component = docs2answers,
-                             name = "Docs2Answers",inputs= ["EmbeddingRetriever"])
-     elif not useQueryCheck and not reader_model:
-         docs2answers = Docs2Answers()
-         semantic_search_pipeline.add_node(component = retriever,
-                             name = "EmbeddingRetriever",inputs = ["Query"])
-         semantic_search_pipeline.add_node(component = docs2answers,
-                             name = "Docs2Answers",inputs= ["EmbeddingRetriever"])
-
-     logging.info(semantic_search_pipeline.components)
-     return semantic_search_pipeline, document_store
-
- def runSemanticPipeline(pipeline:Pipeline, queries:Union[list,str])->dict:
-     """
-     will use the haystack run or run_batch based on if single query is passed
-     as string or multiple queries as List[str]
-
-     Params
-     -------
-     pipeline: haystack pipeline, this is same as returned by semanticSearchPipeline
-         from utils.semanticsearch
-
-     queries: Either a single query or list of queries.
-
-     Return
-     -------
-     results: Dict containing answers and documents as key and their respective
-         values
-
-     """
-
-     if type(queries) == list:
-         results = pipeline.run_batch(queries=queries)
-     elif type(queries) == str:
-         results = pipeline.run(query=queries)
-     else:
-         logging.info("Please check the input type for the queries")
-         return
-
-     return results
-
- def process_query_output(results:dict)->pd.DataFrame:
-     """
-     Returns the dataframe with necessary information like including
-     ['query','answer','answer_offset','context_offset','context','content',
-     'reader_score','retriever_score','id',]. This is designed for output given
-     by semantic search pipeline with single query and final node as reader.
-     The output of pipeline having Docs2Answers as final node or multiple queries
-     need to be handled separately. In these other cases, use process_semantic_output
-     from utils.semantic_search which uses this function internally to make one
-     combined dataframe.
-
-     Params
-     ---------
-     results: this dictionary should have key,values with
-         keys = [query,answers,documents], however answers is optional.
-         in case of [Doc2Answers as final node], process_semantic_output
-         doesnt return answers thereby setting all values contained in
-         answers to 'None'
-
-     Return
-     --------
-     df: dataframe with all the columns mentioned in function description.
-
-     """
-     query_text = results['query']
-     if 'answers' in results.keys():
-         answer_dict = {}
-
-         for answer in results['answers']:
-             answer_dict[answer.document_id] = answer.to_dict()
-     else:
-         answer_dict = {}
-     docs = results['documents']
-     df = pd.DataFrame(columns=['query','answer','answer_offset','context_offset',
-                                'context','content','reader_score','retriever_score',
-                                'id'])
-     for doc in docs:
-         row_list = {}
-         row_list['query'] = query_text
-         row_list['retriever_score'] = doc.score
-         row_list['id'] = doc.id
-         row_list['content'] = doc.content
-         if doc.id in answer_dict.keys():
-             row_list['answer'] = answer_dict[doc.id]['answer']
-             row_list['context'] = answer_dict[doc.id]['context']
-             row_list['reader_score'] = answer_dict[doc.id]['score']
-             answer_offset = answer_dict[doc.id]['offsets_in_document'][0]
-             row_list['answer_offset'] = [answer_offset['start'],answer_offset['end']]
-             start_idx = doc.content.find(row_list['context'])
-             end_idx = start_idx + len(row_list['context'])
-             row_list['context_offset'] = [start_idx, end_idx]
-         else:
-             row_list['answer'] = None
-             row_list['context'] = None
-             row_list['reader_score'] = None
-             row_list['answer_offset'] = None
-             row_list['context_offset'] = None
-         df_dictionary = pd.DataFrame([row_list])
-         df = pd.concat([df, df_dictionary], ignore_index=True)
-
-     return df
-
- def process_semantic_output(results):
-     """
-     Returns the dataframe with necessary information like including
-     ['query','answer','answer_offset','context_offset','context','content',
-     'reader_score','retriever_score','id',]. Distingushes if its single query or
-     multi queries by reading the pipeline output dictionary keys.
-     Uses the process_query_output to get the dataframe for each query and create
-     one concataneted dataframe. In case of Docs2Answers as final node, deletes
-     the answers part. See documentations of process_query_output.
-
-     Params
-     ---------
-     results: raw output of runSemanticPipeline.
-
-     Return
-     --------
-     df: dataframe with all the columns mentioned in function description.
-
-     """
-     output = {}
-     if 'query' in results.keys():
-         output['query'] = results['query']
-         output['documents'] = results['documents']
-         if results['node_id'] == 'Docs2Answers':
-             pass
-         else:
-             output['answers'] = results['answers']
-         df = process_query_output(output)
-         return df
-     if 'queries' in results.keys():
-         df = pd.DataFrame(columns=['query','answer','answer_offset',
-                                    'context_offset','context','content',
-                                    'reader_score','retriever_score','id'])
-         for query,answers,documents in zip(results['queries'],
-                                            results['answers'],results['documents']):
-             output = {}
-             output['query'] = query
-             output['documents'] = documents
-             if results['node_id'] == 'Docs2Answers':
-                 pass
-             else:
-                 output['answers'] = answers
-
-             temp = process_query_output(output)
-             df = pd.concat([df, temp], ignore_index=True)
-
-
-     return df
-
- def semanticsearchAnnotator(matches:List[List[int]], document:Text):
-     """
-     Annotates the text in the document defined by list of [start index, end index]
-     Example: "How are you today", if document type is text, matches = [[0,3]]
-     will give answer = "How", however in case we used the spacy matcher then the
-     matches = [[0,3]] will give answer = "How are you". However if spacy is used
-     to find "How" then the matches = [[0,1]] for the string defined above.
-
-     """
-     start = 0
-     annotated_text = ""
-     for match in matches:
-         start_idx = match[0]
-         end_idx = match[1]
-         if check_streamlit():
-             annotated_text = (annotated_text + document[start:start_idx]
-                               + str(annotation(body=document[start_idx:end_idx],
-                               label="Context", background="#964448", color='#ffffff')))
-         else:
-             annotated_text = (annotated_text + document[start:start_idx]
-                               + colored(document[start_idx:end_idx],
-                               "green", attrs = ['bold']))
-         start = end_idx
-
-     annotated_text = annotated_text + document[end_idx:]
-
-     if check_streamlit():
-
-         st.write(
-             markdown(annotated_text),
-             unsafe_allow_html=True,
-         )
-     else:
-         print(annotated_text)
-
-
- def semantic_keywordsearch(query:Text,documents:List[Document],
-                 embedding_model:Text,
-                 embedding_model_format:Text,
-                 embedding_layer:int, reader_model:str,
-                 retriever_top_k:int = 10, reader_top_k:int = 10,
-                 return_results:bool = False, embedding_dim:int = 768,
-                 max_seq_len:int = 512,top_k_per_candidate:int =1,
-                 sort_by:Literal["retriever", "reader"] = 'retriever'):
-     """
-     Performs the Semantic search on the List of haystack documents which is
-     returned by preprocessing Pipeline.
-
-     Params
-     -------
-     query: Keywords that need to be searche in documents.
-     documents: List fo Haystack documents returned by preprocessing pipeline.
-
-     """
-     semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents = documents,
-                     embedding_model= embedding_model,
-                     embedding_layer= embedding_layer,
-                     embedding_model_format= embedding_model_format,
-                     reader_model= reader_model, retriever_top_k= retriever_top_k,
-                     reader_top_k= reader_top_k, embedding_dim=embedding_dim,
-                     max_seq_len=max_seq_len,
-                     top_k_per_candidate=top_k_per_candidate)
-
-     raw_output = runSemanticPipeline(semanticsearch_pipeline,query)
-     results_df = process_semantic_output(raw_output)
-     if sort_by == 'retriever':
-         results_df = results_df.sort_values(by=['retriever_score'], ascending=False)
-     else:
-         results_df = results_df.sort_values(by=['reader_score'], ascending=False)
-
-     if return_results:
-         return results_df
-     else:
-         if check_streamlit:
-             st.markdown("##### Top few semantic search results #####")
-         else:
-             print("Top few semantic search results")
-         for i in range(len(results_df)):
-             if check_streamlit:
-                 st.write("Result {}".format(i+1))
-             else:
-                 print("Result {}".format(i+1))
-             semanticsearchAnnotator([results_df.loc[i]['context_offset']],
-                                     results_df.loc[i]['content'] )
 
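Note: a hedged sketch of the removed semantic-search entry point; the embedding and reader model names and the file path are placeholder values (the app read the real ones from its config), and the call assumes a Haystack install matching the code above.

    from utils.semantic_search import (runSemanticPreprocessingPipeline,
                                       semantic_keywordsearch)

    output = runSemanticPreprocessingPipeline(file_path="docStore/sample/example.pdf",  # placeholder file
                                              file_name="example.pdf")
    semantic_keywordsearch(query="climate change adaptation",
                           documents=output['documents'],
                           embedding_model="sentence-transformers/all-mpnet-base-v2",   # placeholder model
                           embedding_model_format="sentence_transformers",
                           embedding_layer=None,
                           reader_model="deepset/tinyroberta-squad2",                   # placeholder model
                           retriever_top_k=10, reader_top_k=10)
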
utils/streamlitcheck.py DELETED
@@ -1,42 +0,0 @@
- import logging
- try:
-     import streamlit as st
- except ImportError:
-     logging.info("Streamlit not installed")
-
-
- def check_streamlit():
-     """
-     Function to check whether python code is run within streamlit
-
-     Returns
-     -------
-     use_streamlit : boolean
-         True if code is run within streamlit, else False
-     """
-     try:
-         from streamlit.scriptrunner.script_run_context import get_script_run_ctx
-         if not get_script_run_ctx():
-             use_streamlit = False
-         else:
-             use_streamlit = True
-     except ModuleNotFoundError:
-         use_streamlit = False
-     return use_streamlit
-
- def disable_other_checkboxes(*other_checkboxes_keys):
-     for checkbox_key in other_checkboxes_keys:
-         st.session_state[checkbox_key] = False
-
- def checkbox_without_preselect(keylist):
-     dict_ = {}
-     for i,key_val in enumerate(keylist):
-         dict_[i] = st.checkbox(key_val,key = key_val,
-                                on_change = disable_other_checkboxes,
-                                args=tuple(list(filter(lambda x: x!= key_val, keylist))),)
-
-     for key,val in dict_.items():
-         if val == True:
-             return keylist[int(key)]
-
-     return None
 
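Note: a minimal sketch of the removed Streamlit helpers; the checkbox labels below are placeholders, not taken from this commit.

    from utils.streamlitcheck import check_streamlit, checkbox_without_preselect

    if check_streamlit():
        # inside a Streamlit script: mutually exclusive checkboxes, none preselected
        choice = checkbox_without_preselect(["SDG Classification", "Lexical Search"])  # placeholder labels
    else:
        print("Not running inside Streamlit")
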
utils/uploadAndExample.py DELETED
@@ -1,33 +0,0 @@
- import streamlit as st
- import tempfile
- import json
-
- def add_upload(choice):
-     """
-     Provdies the user with choice to either 'Upload Document' or 'Try Example'.
-     Based on user choice runs streamlit processes and save the path and name of
-     the 'file' to streamlit session_state which then can be fetched later.
-
-     """
-
-     if choice == 'Upload Document':
-         uploaded_file = st.sidebar.file_uploader('Upload the File',
-                                                  type=['pdf', 'docx', 'txt'])
-         if uploaded_file is not None:
-             with tempfile.NamedTemporaryFile(mode="wb", delete = False) as temp:
-                 bytes_data = uploaded_file.getvalue()
-                 temp.write(bytes_data)
-                 st.session_state['filename'] = uploaded_file.name
-                 st.session_state['filepath'] = temp.name
-
-
-     else:
-         # listing the options
-         with open('docStore/sample/files.json','r') as json_file:
-             files = json.load(json_file)
-
-         option = st.sidebar.selectbox('Select the example document',
-                                       list(files.keys()))
-         file_name = file_path = files[option]
-         st.session_state['filename'] = file_name
-         st.session_state['filepath'] = file_path
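
Note: a minimal sketch of how the removed add_upload helper was wired into the Streamlit sidebar; the radio widget shown here is an assumption, but the two option labels come from the docstring above.

    import streamlit as st
    from utils.uploadAndExample import add_upload

    choice = st.sidebar.radio("Select", ('Upload Document', 'Try Example'))  # labels per the docstring
    add_upload(choice)
    # downstream code then reads st.session_state['filename'] and st.session_state['filepath']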