domdomingo ppsingh committed on
Commit e1b1d60 · 0 Parent(s)

Duplicate from GIZ/SDSN-demo


Co-authored-by: Prashant Singh <[email protected]>

.DS_Store ADDED
Binary file (6.15 kB)
 
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ appStore/img/giz_sdsn.jpg filter=lfs diff=lfs merge=lfs -text
+ appStore/img/paris.png filter=lfs diff=lfs merge=lfs -text
+ appStore/img/pic1.png filter=lfs diff=lfs merge=lfs -text
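The rules above route matching files through Git LFS rather than plain Git. As a minimal illustrative sketch only (not part of the repository; the file paths are hypothetical, and Git itself resolves these rules — `fnmatch` only approximates them and does not expand `**`):

```python
# Sketch: approximate which file names the LFS suffix patterns above would catch.
from fnmatch import fnmatch

lfs_patterns = ["*.bin", "*.h5", "*.onnx", "*.pkl", "*.pt", "*.zip", "*tfevents*"]

def is_lfs_tracked(path: str) -> bool:
    # compare only the file name against the suffix patterns
    name = path.rsplit("/", 1)[-1]
    return any(fnmatch(name, pat) for pat in lfs_patterns)

print(is_lfs_tracked("docStore/model/classifier.pt"))  # hypothetical path -> True
print(is_lfs_tracked("appStore/info.py"))              # False
```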
Policy-Action-Tracker_Concept-Note.pdf ADDED
Binary file (154 kB)
 
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: SDSN Demo
+ emoji: 📈
+ colorFrom: purple
+ colorTo: blue
+ sdk: streamlit
+ sdk_version: 1.10.0
+ app_file: app.py
+ pinned: false
+ duplicated_from: GIZ/SDSN-demo
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,18 @@
+ import appStore.keyword_search as keyword_search
+ import appStore.sdg_analysis as sdg_analysis
+ import appStore.coherence as coherence
+ import appStore.info as info
+ from appStore.multiapp import MultiApp
+ import streamlit as st
+
+ st.set_page_config(page_title='Climate Policy Intelligence',
+                    initial_sidebar_state='expanded', layout="wide")
+
+ app = MultiApp()
+
+ app.add_app("About", "house", info.app)
+ app.add_app("Search", "search", keyword_search.app)
+ app.add_app("SDG Analysis", "gear", sdg_analysis.app)
+ app.add_app("NDC Comparison", "exclude", coherence.app)
+
+ app.run()
appStore/__init__.py ADDED
@@ -0,0 +1 @@
+ # creating appstore package
appStore/coherence.py ADDED
@@ -0,0 +1,156 @@
+ # set path
+ import glob, os, sys
+ sys.path.append('../utils')
+
+ import streamlit as st
+ import ast
+ import logging
+ from utils.ndc_explorer import countrySpecificCCA, countrySpecificCCM
+ from utils.checkconfig import getconfig
+ from utils.semantic_search import runSemanticPreprocessingPipeline, process_semantic_output
+ from utils.semantic_search import semanticSearchPipeline, runSemanticPipeline
+ from st_aggrid import AgGrid
+ from st_aggrid.shared import ColumnsAutoSizeMode
+
+ # Read data and declare necessary variables
+ with open('docStore/ndcs/countryList.txt') as dfile:
+     countryList = dfile.read()
+ countryList = ast.literal_eval(countryList)
+ countrynames = list(countryList.keys())
+
+ with open('docStore/ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
+     cca_sent = dfile.read()
+ cca_sent = ast.literal_eval(cca_sent)
+
+ with open('docStore/ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
+     ccm_sent = dfile.read()
+ ccm_sent = ast.literal_eval(ccm_sent)
+
+ config = getconfig('paramconfig.cfg')
+ split_by = config.get('coherence', 'SPLIT_BY')
+ split_length = int(config.get('coherence', 'SPLIT_LENGTH'))
+ split_overlap = int(config.get('coherence', 'SPLIT_OVERLAP'))
+ split_respect_sentence_boundary = bool(int(config.get('coherence',
+                                     'RESPECT_SENTENCE_BOUNDARY')))
+ remove_punc = bool(int(config.get('coherence', 'REMOVE_PUNC')))
+ embedding_model = config.get('coherence', 'RETRIEVER')
+ embedding_model_format = config.get('coherence', 'RETRIEVER_FORMAT')
+ embedding_layer = int(config.get('coherence', 'RETRIEVER_EMB_LAYER'))
+ embedding_dim = int(config.get('coherence', 'EMBEDDING_DIM'))
+ max_seq_len = int(config.get('coherence', 'MAX_SEQ_LENGTH'))
+ retriever_top_k = int(config.get('coherence', 'RETRIEVER_TOP_K'))
+
+
+ def app():
+
+     #### APP INFO #####
+     with st.container():
+         st.markdown("<h1 style='text-align: center; \
+                     color: black;'> NDC Comparison</h1>",
+                     unsafe_allow_html=True)
+         st.write(' ')
+         st.write(' ')
+
+     with st.expander("ℹ️ - About this app", expanded=False):
+
+         st.write(
+             """
+             The *NDC Comparison* application provides an easy evaluation of \
+             coherence between a given policy document and a country’s (Intended) \
+             Nationally Determined Contribution (INDC/NDC) using open-source \
+             data from the German Institute of Development and Sustainability’s \
+             (IDOS) [NDC Explorer](https://klimalog.idos-research.de/ndc/#NDCExplorer/worldMap?NewAndUpdatedNDC??income???catIncome).\
+             """)
+         st.write("")
+         st.write(""" The user can select a country context via the drop-down menu \
+             on the left-hand side of the application. Subsequently, the user is \
+             given the opportunity to manually upload another policy document \
+             from the same national context or to select a pre-loaded example \
+             document. Thereafter, the user can choose between two categories \
+             to compare coherence between the documents: climate change adaptation \
+             and climate change mitigation. Based on the selected information, \
+             the application identifies relevant paragraphs in the uploaded \
+             document and assigns them to the respective indicator from the NDC \
+             Explorer. Currently, the NDC Explorer has 20 indicators under \
+             climate change mitigation (e.g., fossil fuel production, REDD+) and \
+             22 indicators under climate change adaptation (e.g., sea level rise, \
+             investment needs). The assignment of a paragraph to a corresponding \
+             indicator is based on vector similarities; the top 3 results, \
+             if found, are shown to the user. """)
+         st.write("")
+         st.write("")
+         st.markdown("Some runtime metrics tested with CPU: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
+         col1, col2 = st.columns(2)
+         with col1:
+             st.caption("OCR file processing")
+             # st.markdown('<div style="text-align: center;">50 sec</div>', unsafe_allow_html=True)
+             st.write("50 sec")
+
+         with col2:
+             st.caption("NDC comparison on 200 paragraphs (~35 pages)")
+             # st.markdown('<div style="text-align: center;">12 sec</div>', unsafe_allow_html=True)
+             st.write("140 sec")
+
+     with st.sidebar:
+
+         option = st.selectbox('Select Country', (countrynames))
+         countryCode = countryList[option]
+         st.markdown("---")
+
+         genre = st.radio("Select Category", ('Climate Change Adaptation',
+                          'Climate Change Mitigation'))
+         st.markdown("---")
+
+     with st.container():
+         if st.button("Compare with NDC"):
+             sent_cca = countrySpecificCCA(cca_sent, 1, countryCode)
+             sent_ccm = countrySpecificCCM(ccm_sent, 1, countryCode)
+
+             if 'filepath' in st.session_state:
+                 allDocuments = runSemanticPreprocessingPipeline(
+                     file_path=st.session_state['filepath'],
+                     file_name=st.session_state['filename'],
+                     split_by=split_by,
+                     split_length=split_length,
+                     split_overlap=split_overlap,
+                     remove_punc=remove_punc,
+                     split_respect_sentence_boundary=split_respect_sentence_boundary)
+                 # genre = st.radio("Select Category", ('Climate Change Adaptation', 'Climate Change Mitigation'))
+                 if genre == 'Climate Change Adaptation':
+                     sent_dict = sent_cca
+                 else:
+                     sent_dict = sent_ccm
+                 sent_labels = []
+                 for key, sent in sent_dict.items():
+                     sent_labels.append(sent)
+                 if len(allDocuments['documents']) > 100:
+                     warning_msg = ": This might take some time, please sit back and relax."
+                 else:
+                     warning_msg = ""
+                 logging.info("starting coherence analysis, \
+                              country selected {}".format(option))
+                 with st.spinner("Performing Coherence Analysis for {} \
+                                 under {} category{}".format(option, genre, warning_msg)):
+                     semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents=allDocuments['documents'],
+                                             embedding_model=embedding_model,
+                                             embedding_layer=embedding_layer,
+                                             embedding_model_format=embedding_model_format,
+                                             retriever_top_k=retriever_top_k,
+                                             embedding_dim=embedding_dim,
+                                             max_seq_len=max_seq_len, useQueryCheck=False)
+                     raw_output = runSemanticPipeline(pipeline=semanticsearch_pipeline, queries=sent_labels)
+                     results_df = process_semantic_output(raw_output)
+                     results_df = results_df.drop(['answer', 'answer_offset',
+                                     'context_offset', 'context', 'reader_score', 'id'],
+                                     axis=1)
+
+                     for i, key in enumerate(list(sent_dict.keys())):
+                         st.subheader("Relevant paragraphs for topic: {}".format(key))
+                         df = results_df[results_df['query'] == sent_dict[key]].reset_index(drop=True)
+                         for j in range(3):
+                             st.write('Result {}.'.format(j + 1))
+                             st.write(df.loc[j]['content'] + '\n')
+
+             else:
+                 st.info("🤔 No document found, please try to upload it at the sidebar!")
+                 logging.warning("Terminated as no document provided")
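The page above delegates the actual retrieval to `utils/semantic_search.py`, which is not part of this commit. As a minimal illustrative sketch of the idea it describes (assigning document paragraphs to NDC indicator sentences by vector similarity and keeping the top 3), assuming the `sentence-transformers` package and a generic model name rather than the `RETRIEVER` configured in `paramconfig.cfg`:

```python
# Sketch only: embed indicator sentences and paragraphs, keep top-3 paragraphs
# per indicator by cosine similarity. Example strings are illustrative.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model, not the app's configured retriever

indicators = {"sea_level_rise": "Sea level rise is among the five climate risks concerns"}
paragraphs = ["Coastal flooding and rising seas threaten our infrastructure.",
              "The education budget will increase by 4 percent."]

ind_emb = model.encode(list(indicators.values()), convert_to_tensor=True)
par_emb = model.encode(paragraphs, convert_to_tensor=True)

for name, scores in zip(indicators, util.cos_sim(ind_emb, par_emb)):
    top = scores.argsort(descending=True)[:3]        # top-3 results per indicator
    for rank, idx in enumerate(top, start=1):
        print(name, rank, round(float(scores[idx]), 3), paragraphs[idx])
```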
appStore/info.py ADDED
@@ -0,0 +1,72 @@
+ import streamlit as st
+
+ def app():
+
+
+     with open('style.css') as f:
+         st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
+
+     st.markdown("<h2 style='text-align: center; \
+                 color: black;'> Policy Action Tracker</h2>",
+                 unsafe_allow_html=True)
+
+
+     st.markdown("<div style='text-align: center; \
+                 color: grey;'>The Policy Action Tracker is an open-source \
+                 digital tool which aims to assist policy analysts and \
+                 other users in extracting and filtering relevant \
+                 information from policy documents.</div>",
+                 unsafe_allow_html=True)
+     footer = """
+         <div class="footer-custom">
+             Guidance & Feedback - <a href="https://www.linkedin.com/in/maren-bernlöhr-149891222" target="_blank">Maren Bernlöhr</a> |
+             <a href="https://www.linkedin.com/in/manuelkuhm" target="_blank">Manuel Kuhm</a> |
+             Developer - <a href="https://www.linkedin.com/in/erik-lehmann-giz/" target="_blank">Erik Lehmann</a> |
+             <a href="https://www.linkedin.com/in/jonas-nothnagel-bb42b114b/" target="_blank">Jonas Nothnagel</a> |
+             <a href="https://www.linkedin.com/in/prashantpsingh/" target="_blank">Prashant Singh</a>
+         </div>
+     """
+     st.markdown(footer, unsafe_allow_html=True)
+
+     c1, c2, c3 = st.columns([8, 1, 12])
+     with c1:
+         st.image("docStore/img/ndc.png")
+     with c3:
+         st.markdown('<div style="text-align: justify;">The manual extraction \
+                     of relevant information from text documents is a \
+                     time-consuming task for any policy analyst. As the amount and length of \
+                     public policy documents in relation to sustainable development (such as \
+                     National Development Plans and Nationally Determined Contributions) \
+                     continuously increases, a major challenge for policy action tracking – the \
+                     evaluation of stated goals and targets and their actual implementation on \
+                     the ground – arises. Luckily, Artificial Intelligence (AI) and Natural \
+                     Language Processing (NLP) methods can help in shortening and easing this \
+                     task for policy analysts.</div><br>',
+                     unsafe_allow_html=True)
+
+     intro = """
+     <div style="text-align: justify;">
+
+     For this purpose, the United Nations Sustainable Development Solutions \
+     Network (SDSN) and the Deutsche Gesellschaft für Internationale \
+     Zusammenarbeit (GIZ) GmbH have collaborated in the development \
+     of this AI-powered open-source web application that helps find and extract \
+     relevant information from public policy documents faster to facilitate \
+     evidence-based decision-making processes in sustainable development and beyond.
+
+     This tool allows policy analysts and other users to rapidly \
+     search for relevant information/paragraphs in the document according to the \
+     user’s interest, classify the document’s content according to the Sustainable \
+     Development Goals (SDGs), and compare climate-related policy documents and NDCs \
+     across countries using open data from the German Institute of Development and \
+     Sustainability’s (IDOS) NDC Explorer.
+     To understand the application's functionalities and learn more about \
+     the project, see the attached concept note. We hope you like our application 😊
+
+
+     </div>
+     <br>
+     """
+     st.markdown(intro, unsafe_allow_html=True)
+     # st.image("docStore/img/paris.png")
appStore/keyword_search.py ADDED
@@ -0,0 +1,176 @@
+ # set path
+ import glob, os, sys
+ sys.path.append('../utils')
+
+ import streamlit as st
+ import json
+ import logging
+ from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search
+ from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_keywordsearch
+ from utils.checkconfig import getconfig
+ from utils.streamlitcheck import checkbox_without_preselect
+
+ # Declare all the necessary variables
+ config = getconfig('paramconfig.cfg')
+ split_by = config.get('semantic_search', 'SPLIT_BY')
+ split_length = int(config.get('semantic_search', 'SPLIT_LENGTH'))
+ split_overlap = int(config.get('semantic_search', 'SPLIT_OVERLAP'))
+ split_respect_sentence_boundary = bool(int(config.get('semantic_search',
+                                     'RESPECT_SENTENCE_BOUNDARY')))
+ remove_punc = bool(int(config.get('semantic_search', 'REMOVE_PUNC')))
+ embedding_model = config.get('semantic_search', 'RETRIEVER')
+ embedding_model_format = config.get('semantic_search', 'RETRIEVER_FORMAT')
+ embedding_layer = int(config.get('semantic_search', 'RETRIEVER_EMB_LAYER'))
+ embedding_dim = int(config.get('semantic_search', 'EMBEDDING_DIM'))
+ max_seq_len = int(config.get('semantic_search', 'MAX_SEQ_LENGTH'))
+ retriever_top_k = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))
+ reader_model = config.get('semantic_search', 'READER')
+ reader_top_k = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))
+ top_k_per_candidate = int(config.get('semantic_search', 'READER_TOP_K_PER_CANDIDATE'))
+ lexical_split_by = config.get('lexical_search', 'SPLIT_BY')
+ lexical_split_length = int(config.get('lexical_search', 'SPLIT_LENGTH'))
+ lexical_split_overlap = int(config.get('lexical_search', 'SPLIT_OVERLAP'))
+ lexical_remove_punc = bool(int(config.get('lexical_search', 'REMOVE_PUNC')))
+ lexical_top_k = int(config.get('lexical_search', 'TOP_K'))
+
+ def app():
+
+     with st.container():
+         st.markdown("<h1 style='text-align: center; \
+                     color: black;'> Search</h1>",
+                     unsafe_allow_html=True)
+         st.write(' ')
+         st.write(' ')
+
+     with st.expander("ℹ️ - About this app", expanded=False):
+
+         st.write(
+             """
+             The *Search* app is an interface \
+             for doing contextual and keyword searches in \
+             policy documents. \
+             """)
+         st.write("")
+         st.write(""" The application allows its user to perform a search \
+             based on two options: a lexical search ([TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)) \
+             and a semantic search ([bi-encoder](https://www.sbert.net/examples/applications/retrieve_rerank/README.html)). \
+             The lexical search only \
+             displays paragraphs in the document with exact matching results, \
+             while the semantic search shows paragraphs with meaningful connections \
+             (e.g., synonyms) based on the search context. Both \
+             methods employ a probabilistic retrieval framework in their identification \
+             of relevant paragraphs. By default the search is performed using \
+             'Semantic Search'; to find 'Exact/Lexical Matches', please tick the \
+             checkbox provided, which will bypass semantic search. Furthermore, \
+             the application allows the user to search for pre-defined keywords \
+             from different thematic buckets present in the sidebar.""")
+         st.write("")
+         st.write(""" Exact Matches returns the top {} findings, and Semantic
+             Search provides the top {} answers.""".format(lexical_top_k, retriever_top_k))
+         st.write("")
+         st.write("")
+         st.markdown("Some runtime metrics tested with CPU: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
+         col1, col2, col3 = st.columns([2, 4, 4])
+         with col1:
+             st.caption("OCR file processing")
+             # st.markdown('<div style="text-align: center;">50 sec</div>', unsafe_allow_html=True)
+             st.write("50 sec")
+
+         with col2:
+             st.caption("Lexical search on 200 paragraphs (~35 pages)")
+             # st.markdown('<div style="text-align: center;">12 sec</div>', unsafe_allow_html=True)
+             st.write("15 sec")
+
+         with col3:
+             st.caption("Semantic search on 200 paragraphs (~35 pages)")
+             # st.markdown('<div style="text-align: center;">120 sec</div>', unsafe_allow_html=True)
+             st.write("120 sec (including embedding creation)")
+
+     with st.sidebar:
+         with open('docStore/sample/keywordexample.json', 'r') as json_file:
+             keywordexample = json.load(json_file)
+
+         # genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
+         st.caption("Select Keyword Category")
+         genre = checkbox_without_preselect(list(keywordexample.keys()))
+         if genre:
+             keywordList = keywordexample[genre]
+         else:
+             keywordList = None
+
+         st.markdown("---")
+
+     with st.container():
+         type_hinting = "Please enter your question here and we \
+                         will look for an answer in the document, \
+                         OR enter the keyword you are looking \
+                         for and we will look for similar \
+                         context in the document. \
+                         You can also explore predefined sets of keywords from the sidebar. "
+         if keywordList is not None:
+             # queryList = st.text_input("You selected the {} category; we \
+             #                           will look for these keywords in the document".format(genre),
+             #                           value="{}".format(keywordList))
+             queryList = st.text_input(type_hinting,
+                                       value="{}".format(keywordList))
+         else:
+             queryList = st.text_input(type_hinting,
+                                       placeholder="Enter keyword/query here")
+
+         searchtype = st.checkbox("Show only Exact Matches")
+         if st.button("Find them"):
+
+             if queryList == "":
+                 st.info("🤔 No keyword provided; if you don't have any, \
+                         please try the example sets from the sidebar!")
+                 logging.warning("Terminated as no keyword provided")
+             else:
+                 if 'filepath' in st.session_state:
+
+                     if searchtype:
+                         all_documents = runLexicalPreprocessingPipeline(
+                             file_name=st.session_state['filename'],
+                             file_path=st.session_state['filepath'],
+                             split_by=lexical_split_by,
+                             split_length=lexical_split_length,
+                             split_overlap=lexical_split_overlap,
+                             remove_punc=lexical_remove_punc)
+                         logging.info("performing lexical search")
+                         with st.spinner("Performing Exact matching search \
+                                         (Lexical search) for you"):
+                             lexical_search(query=queryList,
+                                            documents=all_documents['documents'],
+                                            top_k=lexical_top_k)
+                     else:
+                         all_documents = runSemanticPreprocessingPipeline(
+                             file_path=st.session_state['filepath'],
+                             file_name=st.session_state['filename'],
+                             split_by=split_by,
+                             split_length=split_length,
+                             split_overlap=split_overlap,
+                             remove_punc=remove_punc,
+                             split_respect_sentence_boundary=split_respect_sentence_boundary)
+                         if len(all_documents['documents']) > 100:
+                             warning_msg = ": This might take some time, please sit back and relax."
+                         else:
+                             warning_msg = ""
+
+                         logging.info("starting semantic search")
+                         with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)):
+                             semantic_keywordsearch(query=queryList,
+                                                    documents=all_documents['documents'],
+                                                    embedding_model=embedding_model,
+                                                    embedding_layer=embedding_layer,
+                                                    embedding_model_format=embedding_model_format,
+                                                    reader_model=reader_model, reader_top_k=reader_top_k,
+                                                    retriever_top_k=retriever_top_k, embedding_dim=embedding_dim,
+                                                    max_seq_len=max_seq_len,
+                                                    top_k_per_candidate=top_k_per_candidate)
+
+                 else:
+                     st.info("🤔 No document found, please try to upload it at the sidebar!")
+                     logging.warning("Terminated as no document provided")
+
+
+
+
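The lexical option described above is implemented in `utils/lexical_search.py`, which is not part of this commit. As a minimal illustrative sketch of the TF-IDF idea the page refers to (example paragraphs and query are made up; the real app uses `lexical_top_k` from `paramconfig.cfg`):

```python
# Sketch only: rank paragraphs by cosine similarity between the query and
# TF-IDF paragraph vectors, mirroring the "Exact/Lexical Matches" mode.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

paragraphs = ["Renewable energy targets include solar and wind power expansion.",
              "The health sector remains vulnerable to climate-related hazards."]
query = "renewable energy"

vectorizer = TfidfVectorizer(stop_words="english")
doc_matrix = vectorizer.fit_transform(paragraphs)
query_vec = vectorizer.transform([query])

scores = cosine_similarity(query_vec, doc_matrix).ravel()
top_k = scores.argsort()[::-1][:10]   # cf. lexical_top_k in paramconfig.cfg
for idx in top_k:
    print(round(scores[idx], 3), paragraphs[idx])
```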
appStore/multiapp.py ADDED
@@ -0,0 +1,70 @@
+ """Framework for running multiple Streamlit applications as a single app.
+ """
+ import streamlit as st
+ from PIL import Image
+ from streamlit_option_menu import option_menu
+ from utils.uploadAndExample import add_upload
+
+ class MultiApp:
+     """Framework for combining multiple streamlit applications.
+     Usage:
+         def foo():
+             st.title("Hello Foo")
+         def bar():
+             st.title("Hello Bar")
+         app = MultiApp()
+         app.add_app("Foo", "house", foo)
+         app.add_app("Bar", "gear", bar)
+         app.run()
+     It is also possible to keep each application in a separate file.
+         import foo
+         import bar
+         app = MultiApp()
+         app.add_app("Foo", "house", foo.app)
+         app.add_app("Bar", "gear", bar.app)
+         app.run()
+     """
+     def __init__(self):
+         self.apps = []
+
+     def add_app(self, title, icon, func):
+         """Adds a new application.
+         Parameters
+         ----------
+         func:
+             the python function to render this app.
+         icon:
+             the bootstrap icon name shown next to the title in the sidebar menu.
+         title:
+             title of the app. Appears in the sidebar menu.
+         """
+         self.apps.append({
+             "title": title,
+             "icon": icon,
+             "function": func
+         })
+
+     def run(self):
+
+         st.sidebar.write(format_func=lambda app: app['title'])
+         image = Image.open('docStore/img/sdsn.png')
+         st.sidebar.image(image, width=200)
+
+         with st.sidebar:
+             selected = option_menu(None, [page["title"] for page in self.apps],
+                                    icons=[page["icon"] for page in self.apps],
+                                    menu_icon="cast", default_index=0)
+             st.markdown("---")
+
+
+         for index, item in enumerate(self.apps):
+             if item["title"] == selected:
+                 self.apps[index]["function"]()
+                 break
+
+
+         choice = st.sidebar.radio(label='Select the Document',
+                                   help='You can upload the document \
+                                   or else you can try an example document',
+                                   options=('Upload Document', 'Try Example'),
+                                   horizontal=True)
+         add_upload(choice)
+
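A minimal usage sketch of this class, mirroring `app.py` above (the page functions are placeholders, and it assumes the script is launched with `streamlit run` from the repository root so that `docStore/img/sdsn.png` and `utils/` are available):

```python
# Sketch only: register pages with a title, a bootstrap icon name and a render
# function, then let run() dispatch to the page picked in the sidebar menu.
import streamlit as st
from appStore.multiapp import MultiApp

def home():
    st.title("Home")

def about():
    st.title("About")

demo = MultiApp()
demo.add_app("Home", "house", home)
demo.add_app("About", "info-circle", about)
demo.run()
```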
appStore/sdg_analysis.py ADDED
@@ -0,0 +1,179 @@
+ # set path
+ import glob, os, sys
+ sys.path.append('../utils')
+
+ # import needed libraries
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import pandas as pd
+ import streamlit as st
+ from st_aggrid import AgGrid
+ from st_aggrid.shared import ColumnsAutoSizeMode
+ from utils.sdg_classifier import sdg_classification
+ from utils.sdg_classifier import runSDGPreprocessingPipeline, load_sdgClassifier
+ from utils.keyword_extraction import textrank
+ import logging
+ logger = logging.getLogger(__name__)
+ from utils.checkconfig import getconfig
+
+
+ # Declare all the necessary variables
+ config = getconfig('paramconfig.cfg')
+ model_name = config.get('sdg', 'MODEL')
+ split_by = config.get('sdg', 'SPLIT_BY')
+ split_length = int(config.get('sdg', 'SPLIT_LENGTH'))
+ split_overlap = int(config.get('sdg', 'SPLIT_OVERLAP'))
+ remove_punc = bool(int(config.get('sdg', 'REMOVE_PUNC')))
+ split_respect_sentence_boundary = bool(int(config.get('sdg', 'RESPECT_SENTENCE_BOUNDARY')))
+ threshold = float(config.get('sdg', 'THRESHOLD'))
+ top_n = int(config.get('sdg', 'TOP_KEY'))
+
+
+ def app():
+
+     #### APP INFO #####
+     with st.container():
+         st.markdown("<h1 style='text-align: center; color: black;'> SDG Classification and Keyphrase Extraction </h1>", unsafe_allow_html=True)
+         st.write(' ')
+         st.write(' ')
+
+     with st.expander("ℹ️ - About this app", expanded=False):
+
+         st.write(
+             """
+             The *SDG Analysis* app is an easy-to-use interface built \
+             in Streamlit for analyzing policy documents with respect to SDG \
+             classification of the paragraphs/texts in the document and \
+             extracting keyphrases per SDG label - developed by GIZ Data \
+             and the Sustainable Development Solutions Network. \n
+             """)
+         st.write("""**Document Processing:** The uploaded/selected document is \
+             automatically cleaned and split into paragraphs with a maximum \
+             length of 120 words using a Haystack preprocessing pipeline. The \
+             length of 120 is an empirical value which should reflect the length \
+             of a “context” and should limit the paragraph length deviation. \
+             However, since we want to respect sentence boundaries, the limit \
+             can be breached and hence this limit of 120 is tentative. \n
+             """)
+         st.write("""**SDG Classification:** The application assigns paragraphs \
+             to 16 of the 17 United Nations Sustainable Development Goals (SDGs). \
+             SDG 17 “Partnerships for the Goals” is excluded from the analysis due \
+             to its broad nature, which could potentially inflate the results. \
+             Each paragraph is assigned to one SDG only. Again, the results are \
+             displayed in a summary table including the number of the SDG, a \
+             relevancy score highlighted through a green color shading, and the \
+             respective text of the analyzed paragraph. Additionally, a pie \
+             chart with a blue color shading is displayed which illustrates the \
+             three most prominent SDGs in the document. The SDG classification \
+             uses open-source training [data](https://zenodo.org/record/5550238#.Y25ICHbMJPY) \
+             from [OSDG.ai](https://osdg.ai/), which is a global \
+             partnership and growing community of researchers and institutions \
+             interested in the classification of research according to the \
+             Sustainable Development Goals. The summary table only displays \
+             paragraphs with a calculated relevancy score above 85%. \n""")
+
+         st.write("""**Keyphrase Extraction:** The application extracts 15 \
+             keyphrases from the document for each SDG label and displays the \
+             results in a summary table. The keyphrases are extracted \
+             using [TextRank](https://github.com/summanlp/textrank), \
+             which is an easy-to-use, computationally less expensive \
+             model leveraging a combination of TF-IDF and graph networks.
+             """)
+         st.write("")
+         st.write("")
+         st.markdown("Some runtime metrics tested with CPU: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
+         col1, col2, col3, col4 = st.columns([2, 2, 4, 4])
+         with col1:
+             st.caption("Loading Time Classifier")
+             # st.markdown('<div style="text-align: center;">12 sec</div>', unsafe_allow_html=True)
+             st.write("12 sec")
+         with col2:
+             st.caption("OCR file processing")
+             # st.markdown('<div style="text-align: center;">50 sec</div>', unsafe_allow_html=True)
+             st.write("50 sec")
+         with col3:
+             st.caption("SDG Classification of 200 paragraphs (~35 pages)")
+             # st.markdown('<div style="text-align: center;">120 sec</div>', unsafe_allow_html=True)
+             st.write("120 sec")
+         with col4:
+             st.caption("Keyword extraction for 200 paragraphs (~35 pages)")
+             # st.markdown('<div style="text-align: center;">3 sec</div>', unsafe_allow_html=True)
+             st.write("3 sec")
+
+
+
+
+     ### Main app code ###
+     with st.container():
+         if st.button("RUN SDG Analysis"):
+
+             if 'filepath' in st.session_state:
+                 file_name = st.session_state['filename']
+                 file_path = st.session_state['filepath']
+                 classifier = load_sdgClassifier(classifier_name=model_name)
+                 st.session_state['sdg_classifier'] = classifier
+                 all_documents = runSDGPreprocessingPipeline(file_name=file_name,
+                                     file_path=file_path, split_by=split_by,
+                                     split_length=split_length,
+                                     split_respect_sentence_boundary=split_respect_sentence_boundary,
+                                     split_overlap=split_overlap, remove_punc=remove_punc)
+
+                 if len(all_documents['documents']) > 100:
+                     warning_msg = ": This might take some time, please sit back and relax."
+                 else:
+                     warning_msg = ""
+
+                 with st.spinner("Running SDG Classification{}".format(warning_msg)):
+
+                     df, x = sdg_classification(haystack_doc=all_documents['documents'],
+                                                threshold=threshold)
+                     df = df.drop(['Relevancy'], axis=1)
+                     sdg_labels = x.SDG.unique()
+                     textrank_keyword_list = []
+                     for label in sdg_labels:
+                         sdgdata = " ".join(df[df.SDG == label].text.to_list())
+                         textranklist_ = textrank(textdata=sdgdata, words=top_n)
+                         if len(textranklist_) > 0:
+                             textrank_keyword_list.append({'SDG': label, 'TextRank Keywords': ",".join(textranklist_)})
+                     textrank_keywords_df = pd.DataFrame(textrank_keyword_list)
+
+
+                     plt.rcParams['font.size'] = 25
+                     colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
+                     # plot
+                     fig, ax = plt.subplots()
+                     ax.pie(x['count'], colors=colors, radius=2, center=(4, 4),
+                            wedgeprops={"linewidth": 1, "edgecolor": "white"},
+                            textprops={'fontsize': 14},
+                            frame=False, labels=list(x.SDG_Num),
+                            labeldistance=1.2)
+                     # fig.savefig('temp.png', bbox_inches='tight', dpi=100)
+
+
+                     st.markdown("#### Anything related to SDGs? ####")
+
+                     c4, c5, c6 = st.columns([1, 2, 2])
+
+                     with c5:
+                         st.pyplot(fig)
+                     with c6:
+                         labeldf = x['SDG_name'].values.tolist()
+                         labeldf = "<br>".join(labeldf)
+                         st.markdown(labeldf, unsafe_allow_html=True)
+                     st.write("")
+                     st.markdown("###### What keywords are present under SDG classified text? ######")
+
+                     AgGrid(textrank_keywords_df, reload_data=False,
+                            update_mode="value_changed",
+                            columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
+                     st.write("")
+                     st.markdown("###### Top few SDG classified paragraph/text results ######")
+
+                     AgGrid(df, reload_data=False, update_mode="value_changed",
+                            columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
+             else:
+                 st.info("🤔 No document found, please try to upload it at the sidebar!")
+                 logging.warning("Terminated as no document provided")
+
+
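The keyphrase step above is wrapped in `utils/keyword_extraction.textrank`, which is not part of this commit. As a minimal illustrative sketch of the TextRank call it refers to, assuming the summanlp/textrank library (published on PyPI as `summa`) and a made-up input text:

```python
# Sketch only: extract keyphrases from SDG-classified text with summa's TextRank.
from summa import keywords

sdg_text = ("Expanding renewable energy, improving energy efficiency and "
            "phasing out fossil fuel subsidies are central to the mitigation plan.")

# cf. top_n = TOP_KEY in paramconfig.cfg (the app extracts 15 keyphrases per SDG label)
print(keywords.keywords(sdg_text, words=15, split=True))
```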
docStore/img/giz_sdsn_small.jpg ADDED
docStore/img/ndc.png ADDED
docStore/img/paris.png ADDED
docStore/img/sdsn.png ADDED
docStore/ndcs/cca.txt ADDED
@@ -0,0 +1,81 @@
1
+ {"climate_risks_droughts": {"category": "climate change adaptation","id": {0:"(I)NDC not submitted or not yet included in analysis",
2
+ 1: "Droughts are not climate risks concerns",
3
+ 2: "Droughts are among the five climate risks concerns"}},
4
+ "climate_risks_extreme_weather": {"category": "climate change adaptation", "id": {0:"(I)NDC not submitted or not yet included in analysis",
5
+ 1: "Extreme Weathers are not climate risks concerns",
6
+ 2: "Extreme Weathers are among the five climate risks concerns"}},
7
+ "climate_risks_floods": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
8
+ 1: "Floods are not climate risks concerns",
9
+ 2: "Floods are among the five climate risks concerns"}},
10
+ "climate_risks_temp_increase": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
11
+ 1: "Temperature increase are not climate risks concerns",
12
+ 2: "Temperature increase are among the five climate risks concerns"}},
13
+ "climate_risks_sea_level_rise": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
14
+ 1: "Sea level rise is not a climate risks concerns",
15
+ 2: "Sea level rise is among the five climate risks concerns"}},
16
+
17
+ "priority_sectors_agriculture": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
18
+ 1: "Agricultural sector is not that important in the context of adaptation ambitions",
19
+ 2: "In the context of adaptation ambitions Agricultural sector is very important for the country",
20
+ 3: "Agriculture sector plays an importance for the country, and therefore in the adaptation ambitions Agriculture sector has special actions and aims"}},
21
+
22
+ "priority_sectors_ecosystems": {"category": "climate change adaptation","id": {0 :"(I)NDC not submitted or not yet included in analysis",
23
+ 1 :"Biodiversity and preservation of Ecosystems is not that important in the context of adaptation ambitions",
24
+ 2: "In the context of adaptation ambitions Biodiversity and preservation of Ecosystems is very important for the country",
25
+ 3: "Biodiversity and Ecosystems plays an importance for the country, and therefore in the adaptation ambitions Biodiversity and Ecosystems has special actions and aims"}},
26
+ "priority_sectors_forestry": {"category": "climate change adaptation", "id": {0: "(I)NDC not submitted or not yet included in analysis",
27
+ 1: "Forestry sector is not that important in the context of adaptation ambitions",
28
+ 2: "In the context of adaptation ambitions Forestry sector is very important for the country",
29
+ 3: "Forestry sector plays an importance for the country, and therefore in the adaptation ambitions Agriculture sector has special actions and aims"}},
30
+ "priority_sectors_health": {"category": "climate change adaptation","id": { 0: "(I)NDC not submitted or not yet included in analysis",
31
+ 1: "Health sector is not that important in the context of adaptation ambitions",
32
+ 2: "In the context of adaptation ambitions Health sector is very important for the country",
33
+ 3: "Health sector plays an importance for the country, and therefore in the adaptation ambitions Agriculture sector has special actions and aims"}},
34
+
35
+ "priority_sectors_water": {"category": "climate change adaptation","id": { 0 : "(I)NDC not submitted or not yet included in analysis",
36
+ 1: "Water sector is not that important in the context of adaptation ambitions",
37
+ 2: "In the context of adaptation ambitions Water sector is very important for the country",
38
+ 3: "Water sector plays an importance for the country, and therefore in the adaptation ambitions Agriculture sector has special actions and aims"}},
39
+
40
+ "vulnerability_agriculture": {"category": "climate change adaptation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
41
+ 1: "Agriculture is a not a vulnerable sector",
42
+ 2: "Agriculture is a vulnerable sector"}},
43
+ "vulnerability_coastal_zones": {"category": "climate change adaptation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
44
+ 1: "Coastal Zone is a not a vulnerable sector",
45
+ 2: "Coastal Zone is a vulnerable sector"}},
46
+ "vulnerability_ecosystems": {"category": "climate change adaptation", "id":{ 0: "(I)NDC not submitted or not yet included in analysis",
47
+ 1: "Biodiversity and Ecosystems is a not a vulnerable sector",
48
+ 2: "Biodiversity and Ecosystems is a vulnerable sector"}},
49
+ "vulnerability_health": {"category": "climate change adaptation","id": {0:"(I)NDC not submitted or not yet included in analysis",
50
+ 1: "Health is a not a vulnerable sector",
51
+ 2: "Health is a vulnerable sector"}},
52
+ "vulnerability_water": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
53
+ 1: "Water is a not a vulnerable sector",
54
+ 2: "Water is a vulnerable sector"}},
55
+
56
+ "costs_of_adaptation": {"category": "climate change adaptation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
57
+ 1: "The partial cost of adaptation is tentatively around few million dollars",
58
+ 2: " The cost of adaptation will be 0-1 billion US$ until 2030",
59
+ 3: " The cost of adaptation will be 1-5 billion US$ until 2030",
60
+ 4: " The cost of adaptation will be 5-10 billion US$ until 2030",
61
+ 5: " The cost of adaptation will be 10-20 billion US$ until 2030",
62
+ 6: "The cost of adaptation will be more than 20 billion US$ until 2030"}},
63
+ "costs_of_future_climate_related_hazards": {"category": "climate change adaptation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
64
+ 1: "The future losses from climate change will be huge",
65
+ 2: "The climate hazards cause significant loss to economy and life, and the cost of Future losses could go around few million dollars"}},
66
+
67
+ "costs_of_recent_climate_related_hazards": {"category": "climate change adaptation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
68
+ 1: "No losses indicated",
69
+ 2: "In the recent climate hazards there has been significant Economic losses.",
70
+ 3: "In the recent climate hazards the impact on human life has been significant",
71
+ 4: "In the recent climate hazards the impact on human life has been significant and the economic loss amounts to 5.3"}},
72
+ "quantified_adaptation_targets": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
73
+ 1:"No quantitative adaptation target",
74
+ 2: "In this sector we aim to achieve the adaptation targets of 5.6 in coming few years",
75
+ 3: "In this sector we aim to achieve the adaptation targets of 5.6 in coming few years",
76
+ 4: "In this sector we aim to achieve the adaptation targets of 5.6 in coming few years"}},
77
+
78
+ "slow_onset_others": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
79
+ 1:"Apart from sea level rise and temperature increase, no other specific slow onset process",
80
+ 2: "There are other slow onset processes additional to sea level rise and temperature increase like loss of biodiversity, desertification, glacier retreat, salinisation or ocean acidification"}},
81
+ }
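The file above is a Python dict literal mapping NDC Explorer adaptation indicators to a category and an id-to-description table; `appStore/coherence.py` loads it with `ast.literal_eval` (the per-country selection in `utils/ndc_explorer.py` is not part of this commit). A short sketch of reading one entry, assuming it is run from the repository root:

```python
# Sketch: load the indicator dictionary and look up one indicator's descriptions.
import ast

with open('docStore/ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
    cca_sent = ast.literal_eval(dfile.read())

indicator = cca_sent["climate_risks_sea_level_rise"]
print(indicator["category"])   # "climate change adaptation"
print(indicator["id"][2])      # "Sea level rise is among the five climate risks concerns"
```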
docStore/ndcs/ccm.txt ADDED
@@ -0,0 +1,86 @@
1
+ {"agriculture": {"category": "climate change mitigation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
2
+ 1: "Agriculture sector is not considered for climate change mitigation",
3
+ 2: "Agriculture sector contribution in greenhouse gases emission is significant and therefore is part of climate change mitigation",
4
+ 3: "Agriculture sector contribution in greenhouse gases emission is significant. Given the importance of agriculture sector for economy and and its adverse contribution in greenhouse gas emissions it is a Focus area for climate change mitigation and needs to be prioritised"}},
5
+
6
+ "energy_efficiency": {"category": "climate change mitigation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
7
+ 1: "Energy Efficiency is not considered for climate change mitigation",
8
+ 2: "Energy sector contribution in greenhouse gases emission is significant and therefore Energy Efficiency is part of climate change mitigation",
9
+ 3: "Energy sector contribution in greenhouse gases emission is significant. Given the importance of the energy sector for economy and its adverse contribution to greenhouse gas emissions, energy efficiency is a Focus area for climate change mitigation and needs to be prioritised. The quantified renewable energy targets like for example in solar, geothermal, wind power are provided."}},
10
+
11
+ "fossil_fuel_production": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
12
+ 1:"There is no recorded FFP (2016)",
13
+ 2: "Fossil fuel Production is important for economy",
14
+ 3:"Fossil fuel Production is important to provide for the basic requirements of the people in the country",
15
+ 4:"The country's dependence on Fossil fuel production to meet energy and other requirements cannot be ignored, however the climate change impact due to the same cannot be ignored. The plans and actions as part of climate change mitigation includes measures to address production (e.g. subsidy removal; taxes); cleaner production",
16
+ 5: "Fossil fuel Production is important to provide for the basic requirements of the people in the country.The country's dependence on Fossil fuel production to meet energy and other requirements cannot be ignored, however the climate change impact due to same cannot be ignored. The plans and actions as part of climate change mitigation includes measures to address production (e.g. subsidy removal; taxes); cleaner production"}},
17
+ "fossil_fuel_subsidiaries": {"category": "climate change mitigation","id":{0: "(I)NDC not submitted or not yet included in analysis",
18
+ 1:"fossil Fuel subsidiaries are not considered",
19
+ 2:"the alternates/subsidiaries to fossil Fuel need to be considered to meet the mitigations ambitions",
20
+ 3:"The fossil fuel contribution towards greenhouse gas emissions is very high and therefore there is a need to find the alternatives/substitutes for the same. The replacement of fossil fuels with alternates is a priority focus area in the mitigation actions to meet mitigation ambitions."}},
21
+
22
+ "land_use_and_forestry": {"category": "climate change mitigation", "id":{0:"(I)NDC not submitted or not yet included in analysis",
23
+ 1:"land use and forestry are not considered",
24
+ 2:"the land use and forestry contribute to greenhouse gas emissions and need to be considered to meet the mitigations ambitions",
25
+ 3:"The land use and forestry contribution towards greenhouse gas emissions is significant and therefore there is need to quantify the mitigation potential land use and forestry."}},
26
+ "land_use_change": {"category": "climate change mitigation", "id": {0:"(I)NDC not submitted or not yet included in analysis",
27
+ 1: "land use change Not mentioned",
28
+ 2: "land use change is being considered, but there are no mitigation targets",
29
+ 3: "land use change is being considered as part of mitigation targets",
30
+ 4: "land use change can play an important role in mitigation efforts. As part of mitigation plan there are quantified targets for land use change.",
31
+ 5: "land use change can play an important role in mitigation efforts. As part of mitigation plan there are quantified targets for land use change."}},
32
+
33
+ "renewable_energy": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
34
+ 1:"renewable energy is not considered",
35
+ 2:"Renewable energy are direct measure to reduce the greenhouse gas emissions and need to be considered to meet the mitigations ambitions",
36
+ 3:"Renewable energy are direct measure to reduce the greenhouse gas emissions and therefore there is need to quantify the mitigation potential in terms of renewable energy targets and specific sub-sectors of action (e.g. solar, geothermal, wind power)"}},
37
+
38
+ "temp_target": {"category": "climate change mitigation", "id": { 0: "(I)NDC not submitted or not yet included in analysis",
39
+ 1:"Not mentioning global effort to limit global temperature increase to 2 degree celsius or 1.5 degree C",
40
+ 2:"there is urgent need to limit global temperature increase to 2 degree celsius",
41
+ 3:"there is urgent need to limit global temperature increase to 1.5 degree C",
42
+ 4:"there is urgent need to limit global temperature increase to 2 degree celsius",
43
+ 5:"there is urgent need to limit global temperature increase to 1.5 degree C"}},
44
+ "waste": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
45
+ 1:"Waste as a topic is not mentioned",
46
+ 2:"Waste reduction or management can play important role in mitigation plan and ambitions",
47
+ 3:"Waste reduction or management can play an important role in sustainable development and hence is a focus area in mitigation plan and ambitions"}},
48
+ "transport": {"category": "climate change mitigation","id": {0:"(I)NDC not submitted or not yet included in analysis",
49
+ 1:"Transport is not considered",
50
+ 2:"Transport contribute to greenhouse gas emissions and need to be considered to meet the mitigations ambitions",
51
+ 3:"transport sector contribution towards greenhouse gas emissions is significant and therefore there is need to focus/prioritise the transport sector to meet the mitigation potential"}},
52
+
53
+ "reducing_non_co2_gases": {"category": "climate change mitigation","id": {0:"(I)NDC not submitted or not yet included in analysis",
54
+ 1:"Reduction of non CO2 gases not indicated",
55
+ 2:"Efforts should be made in reduction of NOn CO2 gases too."}},
56
+
57
+
58
+ "base_year": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
59
+ 1: "No base year",
60
+ 2: "the base year or reference point for measurement of emissions is year 19XX"}},
61
+
62
+ "carbon_capture_and_storage": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
63
+ 1: "carbon capture and storage not indicated",
64
+ 2:"With Technology advancement the mitigation efforts can also in form of carbon capture and storage.",
65
+ 3: "With technological advancement the mitigation efforts can also be in form of carbon capture and storage. This should be a focus area and more options need to be explored to do carbon capture and storage."}},
66
+
67
+ "costs_of_ccm": {"category": "climate change mitigation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
68
+ 1: "(partial) costs not indicated",
69
+ 2: " the mitigation actions and efforts will cost 0-1 billion US$ until 2030",
70
+ 3:"the mitigation actions and efforts will cost 1-5 billion US$ until 2030",
71
+ 4:"the mitigation actions and efforts will cost5-10 billion US$ until 2030",
72
+ 5: "the mitigation actions and efforts will cost 10-20 billion US$ until 2030",
73
+ 6:"the mitigation actions and efforts will cost will be more than 20 billion US$ until 2030"}},
74
+
75
+ "market_mechanisms": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
76
+ 1: "International market mechanisms not mentioned",
77
+ 2:"One good mechanism to deal with greenhouse gas emissions is to explore International market mechanisms",
78
+ 3: "International market mechanisms are not a good way of dealing with mitigation ambitions and therefore should not be considered. Greenhouse gas emissions cannot be part of tradable commodity.",
79
+ 4: "Carbon emissions of greenhouse gas emissions are now a tradable commodity and these can provide a good source for funds and achieving mitigation ambitions. Therefore it is important to exploreInternational market mechanisms. It is important that such means should be explored and there will be plan of actions soon to include these in meeting mitigations target",
80
+ 5: "Carbon emissions of greenhouse gas emissions are now a tradable commodity and these can provide a good source for funds and achieving mitigation ambitions. Therefore it is important to exploreInternational market mechanisms. It is important that such means should be explored and there will be plan of actions soon to include these in meeting mitigations target"}},
81
+
82
+ "redd": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
83
+ 1: "REDD+ not mentioned",
84
+ 2: "Reducing Emissions of Deforestation and Forest Degradation/REDD+",
85
+ 3: "Reducing Emissions of Deforestation and Forest Degradation/REDD+"}},
86
+ }
docStore/ndcs/countryList.txt ADDED
@@ -0,0 +1,170 @@
1
+ {'Afghanistan': 'AFG',
2
+ 'Albania': 'ALB',
3
+ 'Algeria': 'DZA',
4
+ 'Andorra': 'AND',
5
+ 'Angola': 'AGO',
6
+ 'Antigua and Barbuda': 'ATG',
7
+ 'Argentina': 'ARG',
8
+ 'Armenia': 'ARM',
9
+ 'Australia': 'AUS',
10
+ 'Azerbaijan': 'AZE',
11
+ 'Bahamas': 'BHS',
12
+ 'Bahrain': 'BHR',
13
+ 'Bangladesh': 'BGD',
14
+ 'Barbados': 'BRB',
15
+ 'Belarus': 'BLR',
16
+ 'Belize': 'BLZ',
17
+ 'Benin': 'BEN',
18
+ 'Bhutan': 'BTN',
19
+ 'Bolivia': 'BOL',
20
+ 'Bosnia and Herzegovina': 'BIH',
21
+ 'Botswana': 'BWA',
22
+ 'Brazil ': 'BRA',
23
+ 'Brunei Darussalam': 'BRN',
24
+ 'Burkina Faso': 'BFA',
25
+ 'Burundi ': 'BDI',
26
+ 'Cabo Verde': 'CPV',
27
+ 'Cambodia': 'KHM',
28
+ 'Cameroon': 'CMR',
29
+ 'Canada': 'CAN',
30
+ 'Central African Republic': 'CAF',
31
+ 'Chad': 'TCD',
32
+ 'Chile': 'CHL',
33
+ 'China': 'CHN',
34
+ 'Colombia': 'COL',
35
+ 'Comoros': 'COM',
36
+ 'Congo': 'COG',
37
+ 'Cook Islands': 'COK',
38
+ 'Costa Rica': 'CRI',
39
+ 'Cote dIvoire': 'CIV',
40
+ 'Cuba': 'CUB',
41
+ "Democratic People's Republic of Korea": 'PRK',
42
+ 'Democratic Republic of Congo': 'COD',
43
+ 'Djibouti': 'DJI',
44
+ 'Dominica': 'DMA',
45
+ 'Dominican Republic': 'DOM',
46
+ 'Ecuador': 'ECU',
47
+ 'Egypt': 'EGY',
48
+ 'El Salvador': 'SLV',
49
+ 'Equatorial Guinea': 'GNQ',
50
+ 'Eritrea': 'ERI',
51
+ 'Ethiopia': 'ETH',
52
+ 'European Union': 'EU',
53
+ 'Fiji': 'FJI',
54
+ 'Gabon': 'GAB',
55
+ 'Gambia': 'GMB',
56
+ 'Georgia': 'GEO',
57
+ 'Ghana': 'GHA',
58
+ 'Grenada': 'GRD',
59
+ 'Guatemala': 'GTM',
60
+ 'Guinea': 'GIN',
61
+ 'Guinea Bissau': 'GNB',
62
+ 'Guyana': 'GUY',
63
+ 'Haiti': 'HTI',
64
+ 'Honduras': 'HND',
65
+ 'Iceland': 'ISL',
66
+ 'India': 'IND',
67
+ 'Indonesia': 'IDN',
68
+ 'Iran': 'IRN',
69
+ 'Iraq': 'IRQ',
70
+ 'Israel': 'ISR',
71
+ 'Jamaica': 'JAM',
72
+ 'Japan': 'JPN',
73
+ 'Jordan': 'JOR',
74
+ 'Kazakhstan': 'KAZ',
75
+ 'Kenya': 'KEN',
76
+ 'Kingdom of Eswatini': 'SWZ',
77
+ 'Kiribati': 'KIR',
78
+ 'Kuwait': 'KWT',
79
+ 'Kyrgyzstan': 'KGZ',
80
+ 'Lao Peoples Democratic Republic': 'LAO',
81
+ 'Lebanon': 'LBN',
82
+ 'Lesotho': 'LSO',
83
+ 'Liberia': 'LBR',
84
+ 'Libya': 'LBY',
85
+ 'Liechtenstein': 'LIE',
86
+ 'Madagascar': 'MDG',
87
+ 'Malawi': 'MWI',
88
+ 'Malaysia': 'MYS',
89
+ 'Maldives': 'MDV',
90
+ 'Mali': 'MLI',
91
+ 'Marshall Islands': 'MHL',
92
+ 'Mauritania': 'MRT',
93
+ 'Mauritius': 'MUS',
94
+ 'Mexico': 'MEX',
95
+ 'Micronesia': 'FSM',
96
+ 'Monaco': 'MCO',
97
+ 'Mongolia': 'MNG',
98
+ 'Montenegro': 'MNE',
99
+ 'Morocco': 'MAR',
100
+ 'Mozambique': 'MOZ',
101
+ 'Myanmar': 'MMR',
102
+ 'Namibia': 'NAM',
103
+ 'Nauru': 'NRU',
104
+ 'Nepal': 'NPL',
105
+ 'New Zealand': 'NZL',
106
+ 'Nicaragua': 'NIC',
107
+ 'Niger': 'NER',
108
+ 'Nigeria': 'NGA',
109
+ 'Niue': 'NIU',
110
+ 'Norway': 'NOR',
111
+ 'Oman': 'OMN',
112
+ 'Pakistan': 'PAK',
113
+ 'Palau ': 'PLW',
114
+ 'Palestine': 'PSE',
115
+ 'Panama': 'PAN',
116
+ 'Papua New Guinea': 'PNG',
117
+ 'Paraguay': 'PRY',
118
+ 'Peru': 'PER',
119
+ 'Philippines': 'PHL',
120
+ 'Qatar': 'QAT',
121
+ 'Republic of Moldova': 'MDA',
122
+ 'Republic of North Macedonia': 'MKD',
123
+ 'Russian Federation': 'RUS',
124
+ 'Rwanda': 'RWA',
125
+ 'Saint Kitts and Nevis': 'KNA',
126
+ 'Saint Lucia': 'LCA',
127
+ 'Saint Vincent and the Grenadines': 'VCT',
128
+ 'Samoa': 'WSM',
129
+ 'San Marino': 'SMR',
130
+ 'Sao Tome and Principe': 'STP',
131
+ 'Saudi Arabia': 'SAU',
132
+ 'Senegal': 'SEN',
133
+ 'Serbia': 'SRB',
134
+ 'Seychelles': 'SYC',
135
+ 'Sierra Leone': 'SLE',
136
+ 'Singapore': 'SGP',
137
+ 'Solomon Islands': 'SLB',
138
+ 'Somalia': 'SOM',
139
+ 'South Africa': 'ZAF',
140
+ 'South Korea': 'KOR',
141
+ 'South Sudan': 'SSD',
142
+ 'Sri Lanka': 'LKA',
143
+ 'Sudan': 'SDN',
144
+ 'Suriname': 'SUR',
145
+ 'Switzerland': 'CHE',
146
+ 'Syria': 'SYR',
147
+ 'Tajikistan': 'TJK',
148
+ 'Thailand': 'THA',
149
+ 'Timor Leste': 'TLS',
150
+ 'Togo': 'TGO',
151
+ 'Tonga': 'TON',
152
+ 'Trinidad and Tobago': 'TTO',
153
+ 'Tunisia': 'TUN',
154
+ 'Turkey': 'TUR',
155
+ 'Turkmenistan': 'TKM',
156
+ 'Tuvalu': 'TUV',
157
+ 'Uganda': 'UGA',
158
+ 'Ukraine': 'UKR',
159
+ 'United Arab Emirates': 'ARE',
160
+ 'United Kingdom': 'GBR',
161
+ 'United Republic of Tanzania': 'TZA',
162
+ 'United States of America': 'USA',
163
+ 'Uruguay': 'URY',
164
+ 'Uzbekistan': 'UZB',
165
+ 'Vanuatu': 'VUT',
166
+ 'Venezuela ': 'VEN',
167
+ 'Vietnam': 'VNM',
168
+ 'Yemen': 'YEM',
169
+ 'Zambia': 'ZMB',
170
+ 'Zimbabwe': 'ZWE'}
docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt ADDED
@@ -0,0 +1,737 @@
1
+ Ethiopia 2030: The Pathway to Prosperity
2
+ Ten Years Perspective Development Plan (2021 � 2030)
3
+ 1. Baselines and Assumptions
4
+ 2. Strategic pillars
5
+ 3. Departures
6
+ 4. Macroeconomic goals
7
+ 5. Implications of the COVID-19 pandemic and necessary mitigation measures
8
+ 6. Potentials/capabilities
9
+ 7. Focus areas
10
+ 7.1. Productive sectors
11
+ 7.2. Services sector
12
+ 7.3. Enabling sectors
13
+ 8. Balanced and competitive development (nationally, regionally and locally)
14
+ 9. Monitoring and Evaluation
15
+ Content
16
+ 1. Baselines and Assumptions
17
+ Poverty Reduction (%): Key performances of previous years
+ [Chart: Proportion of people living below the poverty line, 1994-2020: 45.5, 44.2, 38.7, 29.6, 23.5 and 19 percent]
+ [Chart: GDP growth rate (%), GTP I (2011-2015) and GTP II (2015/16-2019/20, projection with COVID-19): 10.5, 8.8, 10.1, 7.7, 9 and 5.19-6.20]
54
+ 1. Baselines and Assumptions
55
+ [Chart: Merchandise export as % of GDP, declining from 8.66 to 2.77 percent over 2010/11-2018/19]
+ [Chart: Share of economic sectors in GDP (%), 2010/11-2018/19: Agriculture falling from 46.9 to 32.8, Industry rising from 13.4 to 27.8, Manufacturing from 4.7 to 6.8, Construction from 7.1 to 20.1, Services roughly stable around 39-40]
113
+ 1. Baselines and Assumptions
114
+ [Chart: Labour force participation (2013): Agriculture 73%, Industry 7%, Services 20%]
+ [Chart: Urban labour force participation (2013): Agriculture 7%, Industry 22%, Services 71%]
128
+ 1. Baselines and Assumptions
129
+ High and increasing Unemployment Rate
130
+ � Urban unemployment rate = 19.1% in 2018
131
+ � Youth unemployment rate = 25.3 %
132
+ ? Male = 18.6%
133
+ ? Female 30.9 %
134
+ � Rural unemployment rate = 2% in 2013
135
+ � Declining per capita rural land creating
136
+ disguised unemployment
137
+ [Chart: Number of unemployed people in urban areas by sex and age group (15-19, 20-24, 25-29 yr.), 2014 vs 2018: total rising from 402,869 to 471,535]
143
+ 1. Baselines and Assumptions
144
+ Challenges
145
+ 1. Macroeconomic imbalances
146
+ ?Sustained high inflation
147
+ ?High and rising unemployment especially
148
+ in urban areas
149
+ ?High and rising debt burden
150
+ ?Chronic foreign currency shortage
151
+ ?Sluggish (though encouraging) rate of
152
+ structural change
153
+ 2. Vulnerability to shocks (COVID-19, Climate
154
+ changes, Desert Locust infestation, etc)
155
+ 3. Poor quality and high inequity in
156
+ infrastructure projects
157
+ 4. Poor quality services in health and
158
+ education
159
+ � High repetition and dropout rates from school
160
+ 1. Baselines and Assumptions
161
+ � Poor quality of growth and slow
162
+ structural change
163
+ � Excessive aid and loan
164
+ dependence for financing
165
+ infrastructural and construction
166
+ investments
167
+ � Limited success in expanding
168
+ manufacturing and modern
169
+ agriculture which have high job
170
+ creation potentials
171
+ � Weak institutional capacity as
172
+ the main culprit of all failures
173
+ ? Provision of quality services
174
+ (electricity, water, telephone,
175
+ internet)
176
+ ? Creation of enough jobs and
177
+ improved living standards
178
+ ? Generation of reliable foreign
179
+ exchange revenue and debtsustainable
180
+ national economic
181
+ capacity
182
+ ? Completion of development
183
+ projects and investment plans
184
+ under public-private
185
+ partnerships
186
+ � Low reward for merit, productivity and effort
187
+ while low disincentive for laziness, wastefulness
188
+ and corruption
189
+ � Slow institutional change and transformation in:
190
+ ? Government policies
191
+ ? Investor attitude
192
+ ? Youth behaviour
193
+ ? Role of the intellectuals
194
+ � The need for sustained increase in production
195
+ and productivity
196
+ � The need to set a common national vision to
197
+ achieve major successes with consensus and
198
+ popular legitimacy
199
+ Major areas of failure in the economy
240
+ 2. Departures
241
+ 1. Emphasis on quality of economic growth
242
+ 2. Participation and coordination of sectors in the planning process
243
+ 3. Sectoral linkages and multi-sectoral development focus
244
+ 4. Preparation of national development corridors based on development potentials
245
+ 5. Focus on solving institutional bottlenecks
246
+ 6. The ongoing home grown economic reform programme as a sprinting board
247
+ 7. Emphasis on resilience building, innovation and entrepreneurship
248
+ 3. Strategic pillars
249
+ 1. Ensure quality growth
250
+ 2. Improve productivity and competitiveness
251
+ 3. Undertake institutional transformation
252
+ 4. Ensure private sector's leadership in the economy
253
+ 5. Ensure equitable participation of women and children
254
+ 6. Build climate resilient green economy
255
+ 3. Strategic pillars
256
+ � Increasing export revenues and substituting imports by
257
+ reducing production costs
258
+ � Availing quality and massive infrastructure
259
+ ? Linking infrastructural development with development corridors
260
+ � Producing required human resources with quality
261
+ � Producing enough and quality human resources
262
+ � Prioritizing innovative production systems
263
+ � Linking incentives with export revenue and job creation
264
+ performances
265
+ � Modernizing and enhancing the logistic system
266
+ � Creating technological competences needed for longterm
267
+ growth
268
+ � The economic growth should ensure:
269
+ ? Participation of all citizens and equitable utilization of the
270
+ growth proceeds
271
+ ? Improved standard of living of every citizen
272
+ ? Reduced poverty in all indicators
273
+ ? Reduced inflation and unemployment
274
+ � The economic growth should lead to increased
275
+ aggregate supply
276
+ � Focus on modern agriculture, manufacturing and
277
+ mining
278
+ � Emphasis on exploiting the sources of growth through
279
+ structural change
280
+ 1.Ensuring quality economic growth 2. Raising production and productivity
281
+ 3. Strategic pillars
282
+ � Build democratic and judicial institutions that ensure elite bargain,
283
+ national consensus, common vision and government legitimacy
284
+ � Build private sector and competition friendly bureaucracy
285
+ � Coordinate with parents, the society and teachers to make
286
+ educational institutions centers of excellence and virtuous citizens
287
+ � Coordinate with parents as well as social and religious leaders to
288
+ encourage religious institutions and their teachings contribute
289
+ towards poverty reduction efforts
290
+ � Prepare policies, strategies and legal frameworks for achieving
291
+ prosperity
292
+ � Increased focus on innovation and research
293
+ � Creating strong social security system
294
+ 3. Institutional Transformation 4. Private sector's leadership in the economy
295
+ � Create conducive investment climate and incentivize
296
+ domestic investors in key sectors
297
+ � Build strong and market-led public-private partnerships in
298
+ order to ensure the establishment of inclusive and
299
+ pragmatic market economy
300
+ � Enhance access and quality of infrastructure to attract
301
+ quality foreign direct investment
302
+ � Identify new sources of growth, empower and stimulate
303
+ the private sector, and supplement the private sector in
304
+ strategic areas
305
+ � Emphasis for public-private partnership on problem
306
+ solving innovations and research activities
307
+ 3. Strategic pillars
308
+ � Ensure gender equity in economic and social
309
+ sectors
310
+ ? Participation of women at all levels of education
311
+ ? Asset ownership of women
312
+ � Ensure fair participation of women and youth in
313
+ leadership and decision making positions
314
+ � Create awareness among citizens about the role of
315
+ women and youth in the country's overall
316
+ development
317
+ � Increase basin development efforts to fight land
318
+ degradation and to reduce pollutions
319
+ � Improve productivity and reduce GHG emissions
320
+ � Increase forest protection and development
321
+ � Increase production of electricity from renewable
322
+ sources for domestic use and for export
323
+ � Focus on modern and energy saving technologies
324
+ 5. Equitable participation of women and children 6. Climate resilient green economy
325
+ 4. Macroeconomic Goals
326
+ Assumptions
327
+ ? Requirement to significantly reduce
328
+ poverty
329
+ ? Available national potentials
330
+ ? Potential for investment in the economy
331
+ ? Existing potentials in each sector
332
+ ? Low productivity that needs to be
333
+ improved
334
+ � Make Ethiopia a middle income
335
+ economy by 2022
336
+ � Raise per capita income to USD 1,115
337
+ in 2022
338
+ ? Threshold for middle-income is USD 1,026
339
+ ? Plus human development index and
340
+ economic vulnerability index
341
+ � Raise per capita income to USD 2,220
342
+ by 2030
343
+ Sectoral growth Targets (2021-2030)
344
+ Assured middle- income potential
345
+ 10.2%
346
+ Average
347
+ Growth
348
+ Target
349
+ Percentage of population below poverty line
350
+ 4. Macroeconomic Goals
351
+ Structural change
352
+ Financing Gaps
353
+ Reduce urban unemployment to less than 9%
354
+ ?1.36 million new jobs need to be
355
+ created per annum
356
+ Sectoral composition of GDP Labour force participation
357
+ Economic Sectors: Performance (2011, 2015, 2018/19) vs Target (2030)
361
+ Agriculture 45 39.7 32.8 22.0
362
+ Industry 15.1 21.2 27.6 35.9
363
+ Manufacturing 4.7 5.5 6.8 17.2
364
+ Services 39.9 39 39.4 42.1
365
+ 5. Implications of the COVID-19 pandemic and necessary mitigation measures
366
+ � GDP growth for 2019/20 fiscal year is projected to be lower than its target of 9.0% by between 2.81
367
+ and 3.80 percentage points (equivalent to 58.3 - 78.8 billion birr) due to COVID-19 pandemic
368
+ � If the current scenario continues, next year's GDP growth could decline by 2.8 percentage points
369
+ � Returning the economy to its high growth trajectory requires focusing on sectors with high
370
+ productivity and job creation potentials
371
+ � Public investment should focus on empowering the private sector
372
+ � Promoting both domestic and foreign investments with the right set of incentives (merit based)
373
+ � Modernizing production systems and improving uptake of technology
374
+ � Conducting demand analysis for export commodities to remedy for the declining trend in exports
375
+ and foreign exchange earnings.
376
+ 6. Potentials
377
+ � Endowment of various natural resources contributing to the growth potential
378
+ � Huge unutilized arable land creates great potential for the success of the plan
379
+ � Endowment of gemstones, ornamental, energy, metals, and metallic minerals
380
+ � Gold, coal, iron ore, potash, tantalum, marble, petroleum and other natural resources
381
+ Natural
382
+ Resources
383
+ � Large youth population and potential for demographic dividend
384
+ � Cumulative capacity in education and health
385
+ � Positive attitude and noble culture of reaching agreement among citizens
386
+ Human
387
+ capital
388
+ 6. Potentials
389
+ Built physical and material capitals
390
+ ?Transport and communication
391
+ ? Irrigation infrastructures for modern agriculture
392
+ ?Industrial Parks
393
+ ?Mega energy infrastructures
394
+ Physical
395
+ capital
396
+ Unexploited
397
+ growth
398
+ potentials
399
+ � Utilizing the tourism potential through modernization
400
+ � Using the mining subsector as a source of input as well as a competitive industry in its
401
+ own right
402
+ 6. Potentials
403
+ � Solving supply side bottlenecks to satisfy the existing demand
404
+ � Improving international acceptance and reliable partnerships
405
+ ? The 'medemer'/synergy philosophy
406
+ ? The ongoing political reform measures
407
+ ? The Homegrown Economic Reform programme
408
+ � Increased finance from partners and multilateral institutions
409
+ ? Increased availability of foreign exchange
410
+ ? Reduced debt stress for the short to medium term
411
+ ? Increased potential for development
412
+ Increased
413
+ demand as
414
+ potential
415
+ Political Capital
416
+ Continental
417
+ and regional
418
+ integrations
419
+ � Regional and continental economic integration agreements
420
+ � International and continental free trade agreements
421
+ 6. Potentials
422
+ Low
423
+ technology as
424
+ a potential
425
+ � Undeniably low status of technological development
426
+ � International mobility and spillover effect of technology
427
+ � Potential for development and catching up by filling the technological gaps
428
+ � Doubling crop productivity from the current 24-36 quintals per hectare will result
429
+ in 7% increase in crop production
430
+ � Raise the production efficiency of manufacturing from the current 50% to 80%
431
+ 7. Focus Areas
432
+ 7.1. Productive sectors: agriculture, manufacturing, mining
433
+ 7.2. Service sector: tourism
434
+ 7.3. Enabling sectors: energy, transport, sustainable finance,
435
+ innovation and technology, urban development, irrigation,
436
+ human capital development
437
+ 7.1. Productive sectors
438
+ Agriculture Objectives
439
+ 1. Free agriculture from rain dependence
440
+ 2. Agricultural mechanization services
441
+ 3. Contract farming, cluster approach and
442
+ land consolidation
443
+ 4. Livestock, animal feed and animal health
444
+ 5. Horticulture (irrigation and urban farming)
445
+ 6. Private sector participation
446
+ 7. Institutional implementation capacity
447
+ 8. Climate resilient sustainable agricultural
448
+ development
449
+ 1. Improve income and livelihood options for farming and pastoral
450
+ communities through increased productivity and competitiveness
451
+ 2. Modernize agriculture and ensure national food and nutrition security
452
+ 3. Raise export of agricultural output and substitute imports
453
+ 4. Make agriculture a viable and profitable enterprise through value addition
454
+ 5. Create rural employment opportunities
455
+ 6. Enhance livestock health access and quality
456
+ 7. Preserve animal genetic resources and increase pastoral research
457
+ 8. Improve the development of animal feed and access to markets
458
+ 9. Develop livestock specific extension package for each livestock type
459
+ Focus Areas
460
+ 7.1. Productive sector
461
+ Manufacturing Industry
462
+ Objectives
463
+ 1. Production of quality and competitive food, textile, housing and
464
+ pharmaceutical products for export and domestic markets
465
+ 2. Production and productivity of existing manufacturing industries
466
+ 3. Utilization of locally available inputs
467
+ 4. Value chains, linkages and interdependencies
468
+ 5. Linkages between large scale metallurgical and engineering,
469
+ chemical and pharmaceutical industries with other industries
470
+ 6. Job creation, cluster approaches and expanding small and medium
471
+ scale manufacturing
472
+ 7. Private sector participation and partnership
473
+ 1. Establish basis for domestic industrialization
474
+ 2. Value addition through enhanced inter-sectoral
475
+ linkages
476
+ 3. Enhance productivity through private sector
477
+ leadership and supportive role of the
478
+ government
479
+ ? Create job opportunities for the youth leaving
480
+ agriculture and concentrating in urban areas
481
+ ? Make exportable commodities internationally
482
+ competitive
483
+ ? Ensure structural change
484
+ Focus areas
485
+ 7.1. Productive sectors
486
+ Mining
487
+ Objectives
488
+ � Foreign exchange earning and
489
+ domestic revenues
490
+ � Increased investment in mining
491
+ � Participation of manufacturing
492
+ industries that add value
493
+ � Job creation
494
+ � Add value for improved contribution of the subsector
495
+ � Increase inter-sectoral linkages to raise raw material inputs to other
496
+ sectors
497
+ � Make mining a competent subsector and induce structural change
498
+ � Increase human resource and technological capabilities through
499
+ research and trainings
500
+ � Raise foreign exchange revenue from mining through increased
501
+ exploration and production
502
+ � Improve traditional mining production and marketing systems
503
+ � Improve the country�s geological information
504
+ Focus areas
505
+ 7.2. Service sector
506
+ Tourism
507
+ Objectives
508
+ � Identification and developing destinations
509
+ � Infrastructure
510
+ � Competitiveness
511
+ ?improve existing destinations
512
+ ?develop new destinations
513
+ ? diversify service and raise quality
514
+ � Market linkages, branding, and promotion
515
+ � Technology, research and development
516
+ � Preservation, maintenance and proper
517
+ utilization of heritage resources
518
+ � Expand job opportunities
519
+ � Raise incomes
520
+ � Build information management
521
+ systems
522
+ � Increase implementation capacity
523
+ Focus areas
524
+ 7.3. Enabling sectors
525
+ Urban development
526
+ Objectives
527
+ ? Prioritize productive sectors in job creation and enterprise
528
+ development plans
529
+ ? Rapid development and equity goals in land provision system
530
+ ? Participation of indigenous people in land redevelopment and
531
+ expansion
532
+ ? Urban land registration and cadaster system, modern
533
+ property valuation
534
+ ? Greenery and public spaces as well as waste disposal and
535
+ management in urban planning and implementation
536
+ ? Housing development and financing options to reduce
537
+ housing shortages
538
+ ? Integrated infrastructure and services provision
539
+ ? Role of private sector in infrastructure development and
540
+ service provision
541
+ � Expand micro and small-scale
542
+ enterprises to reduce urban
543
+ unemployment
544
+ � Develop and avail urban land based on
545
+ demand, equity and cost effectiveness
546
+ � Make quality housing accessible both in
547
+ rural and urban areas
548
+ � Develop quality and integrated
549
+ infrastructure as well as service
550
+ provision in towns
551
+ � Improve financial management and
552
+ resource utilization in urban areas
553
+ Focus areas
554
+ 7.3. Enabling sectors
555
+ Innovation and Technology
556
+ Objectives
557
+ ? Access to innovation and
558
+ technological information
559
+ ? Developing a digital economy
560
+ ? Productivity enhancement and
561
+ competitiveness
562
+ ? Build a digital economy
563
+ ? Develop national scientific research and technological
564
+ capabilities
565
+ ? Support problem solving research and development of
566
+ technologies necessary for raising production,
567
+ productivity and service provision
568
+ ? Create jobs and capital that are based on technology
569
+ ? Develop technological and data security protection
570
+ systems
571
+ Focus areas
572
+ 7.3. Enabling sectors
573
+ Sustainable finance
574
+ Objectives
575
+ � Access to modern finance and saving culture in rural
576
+ areas
577
+ � Support to the private sector and corporations to
578
+ reinvest profits in productive sectors
579
+ � Role of private financial institutions in manufacturing
580
+ and agriculture
581
+ � Digital revenue collection system
582
+ � Tax equity (contraband, tax evasion, and bringing the
583
+ underground economy to the tax system)
584
+ � Domestic and foreign strategic partnerships
585
+ � Transform financing from short term to long-term,
586
+ sustainable and quality sources
587
+ � Ensure financing quality based on sectoral prioritization
588
+ and reduction of wastage
589
+ � Increase the number of domestic saving institutions both
590
+ in rural and urban areas
591
+ � Support domestic finance with foreign exchange capacity
592
+ and foreign direct investment
593
+ � Modernize domestic revenue collection system
594
+ � Raise voluntary tax payment attitude
595
+ � Bring the informal sector to the formal tax system
596
+ Focus areas
597
+ 7.3. Enabling sectors
598
+ Transport
599
+ Objectives
600
+ � Access to infrastructure
601
+ � Implementation capacity
602
+ � Participation of the private sector and the general
603
+ public
604
+ � Financing capacity
605
+ � Ensure equitable access to transport infrastructure and
606
+ services
607
+ � Improve transport safety
608
+ � Make logistics services fast and reliable
609
+ � Build transport infrastructure and service that is
610
+ resilient to climate change
611
+ Focus areas
612
+ 7.3. Enabling sectors
613
+ Energy
614
+ Objectives
615
+ ? Equity in access to electricity services
616
+ ? Energy access and quality
617
+ ? Alternative sources of energy
618
+ ? Reliability of electricity infrastructure
619
+ ? Investment and income in energy subsector
620
+ � Ensure equitable access to transport
621
+ infrastructure and services
622
+ � Improve transport safety
623
+ � Make logistics services fast and reliable
624
+ � Build transport infrastructure and service that is
625
+ resilient to climate change
626
+ Focus areas
627
+ 7.3. Enabling sectors
628
+ Irrigation
629
+ Objectives
630
+ ? Medium and large scale irrigation infrastructure
631
+ ? Job creation
632
+ ? Share of government expenditure and alternative
633
+ financing options
634
+ ? Institutional capacity and human resource
635
+ development
636
+ ? Improve agricultural output and productivity
637
+ ? Reduce government spending and enhance
638
+ institutional capacity and human resources
639
+ development
640
+ ? Ensure the inclusion of all genders and
641
+ disabled citizens
642
+ ? Develop alternative financing options for
643
+ irrigation development
644
+ Focus areas
645
+ 7.3. Enabling sectors
646
+ Human capital development
647
+ Objectives
648
+ � Make education and training inclusive and equitable by
649
+ harmonizing the system with ability, need and capacity
650
+ � Develop capacity of educational institutions (teacher capacity,
651
+ inputs and technology)
652
+ � Establish education and training quality assurance system
653
+ � Avail free and compulsory education for pre-primary to junior
654
+ secondary levels and free education at the senior secondary levels
655
+ equitably
656
+ � Ensure the relevance of education and training system and
657
+ synchronize education policy with economic and social
658
+ development needs
659
+ � Make the education and training policy compatible with the
660
+ nation's contemporary capacities as well as global and regional
661
+ market opportunities
662
+ � Enhance commitment, capability and responsibility of citizens
663
+ ? Ensure equitable and quality health services
664
+ ? Raise average life expectancy
665
+ ? Achieve universal health coverage through
666
+ proactive and prevention health system
667
+ ? Curtail preventable maternal and child deaths
668
+ ? Reduce incidences of contagious and noncontagious
669
+ related diseases and deaths
670
+ ? Build capacity for health tourism through
671
+ increased treatment capabilities
672
+ ? Create a healthy society that is free from
673
+ addictions and use technology for supporting
674
+ knowledge led economic development
675
+ Focus areas
676
+ 8 Nationally, regionally and locally balanced and competitive development
677
+ 1. Lack of synchronization of investment with
678
+ resource potentials and development needs
679
+ 2. Poor alignment of federal, regional and
680
+ district level investment plans with the
681
+ national development goals and envisioned
682
+ settlement patterns
683
+ 3. Poor regional coordination due to low
684
+ consideration for trans-regional and
685
+ spatial issues in development plans of
686
+ regional states
687
+ 4. Inter-regional and intra-regional
688
+ disparities in infrastructural development
689
+ and access to services
690
+ Challenges
691
+ 8. Nationally, regionally and locally balanced and competitive development
692
+ 1. Ensure that the investment flow and
693
+ infrastructural development plans fairly go hand in
694
+ hand with resource potential and development
695
+ needs
696
+ ?Developing underutilized natural resources
697
+ ?Equitable distribution and access to
698
+ infrastructure
699
+ ?Sustainable environmental protection
700
+ 2. Ensure the inclusion of pastoral and agro-pastoral
701
+ areas in the development
702
+ ?Focused infrastructural development in pastoral
703
+ areas such as education and health sector input
704
+ provision as well as governance
705
+ ?Market linkages with other areas and the central
706
+ markets
707
+ ?Improve rural finance (credit and insurance) to
708
+ encourage fattening, milk processing, leather
709
+ production and irrigation agriculture
710
+ Focus areas
711
+ 9. Monitoring and Evaluation
712
+ 10 Years Perspective
713
+ Plan KPIs
714
+ Federal Implementing
715
+ Institutions
716
+ Planning and
717
+ Development Commission
718
+ Generate Data (Census,
719
+ Sample and administrative
720
+ data)
721
+ Annual Reports
722
+ Dialogue forums
723
+ (Civic Organizations, professional
724
+ associations, development partners,
725
+ intellectuals)
726
+ Central Statistical Agency
727
+ Database
728
+ National
729
+ Information Portal
730
+ National Statistics
731
+ Development Strategic
732
+ plan
733
+ Evaluation Reports
734
+ Prime Minister's Office
735
+ House of People's
736
+ Representatives
737
+ Thank you!
docStore/sample/South Africa_s Low Emission Development Strategy.txt ADDED
The diff for this file is too large to render. See raw diff
 
docStore/sample/files.json ADDED
@@ -0,0 +1,3 @@
+ {"Ethiopia: 10 Year Development Plan":"docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt",
+ "South Africa:Low Emission strategy":"docStore/sample/South Africa_s Low Emission Development Strategy.txt"
+ }
docStore/sample/keywordexample.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "Climate":"Climate,Adaptation,Mitigation,Decarbonization,Carbon neutrality,Net zero Emissions",
+ "Food":"Food security,Nutrition,Diets,Food loss",
+ "Implementation":"Implementation,transformation,reform,integration,strategy,policy",
+ "Nature":"Nature,Nature-based solutions,Biodiversity,Degradation",
+ "Social":"Indigenous,Local community(ies),Rural livelihoods,Minority",
+ "Gender":"gender, women empowerment, women economic power, gender bias"
+ }
packages.txt ADDED
@@ -0,0 +1,4 @@
+ poppler-utils
+ xpdf
+ tesseract-ocr
+ libtesseract-dev
paramconfig.cfg ADDED
@@ -0,0 +1,47 @@
+ [lexical_search]
+ TOP_K = 20
+ SPLIT_BY = word
+ SPLIT_LENGTH = 120
+ SPLIT_OVERLAP = 0
+ REMOVE_PUNC = 0
+
+ [semantic_search]
+ RETRIEVER_TOP_K = 10
+ MAX_SEQ_LENGTH = 384
+ RETRIEVER = all-mpnet-base-v2
+ RETRIEVER_FORMAT = sentence_transformers
+ EMBEDDING_DIM = 768
+ RETRIEVER_EMB_LAYER = -1
+ READER = deepset/tinyroberta-squad2
+ READER_TOP_K = 10
+ READER_TOP_K_PER_CANDIDATE = 1
+ SPLIT_BY = word
+ SPLIT_LENGTH = 120
+ SPLIT_OVERLAP = 10
+ RESPECT_SENTENCE_BOUNDARY = 1
+ REMOVE_PUNC = 0
+
+ [sdg]
+ THRESHOLD = 0.85
+ MODEL = jonas/bert-base-uncased-finetuned-sdg
+ SPLIT_BY = word
+ REMOVE_PUNC = 0
+ SPLIT_LENGTH = 120
+ SPLIT_OVERLAP = 10
+ RESPECT_SENTENCE_BOUNDARY = 1
+ TOP_KEY = 15
+
+
+ [coherence]
+ RETRIEVER_TOP_K = 10
+ MAX_SEQ_LENGTH = 512
+ RETRIEVER = msmarco-distilbert-dot-v5
+ RETRIEVER_FORMAT = sentence_transformers
+ RETRIEVER_EMB_LAYER = -1
+ EMBEDDING_DIM = 768
+ THRESHOLD = 0.55
+ SPLIT_BY = word
+ SPLIT_LENGTH = 120
+ SPLIT_OVERLAP = 10
+ RESPECT_SENTENCE_BOUNDARY = 1
+ REMOVE_PUNC = 0
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ farm-haystack == 1.10
+ farm-haystack[ocr]==1.10.0
+ spacy==3.2.0
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
+ matplotlib==3.5.1
+ nltk==3.7
+ numpy==1.22.1
+ pandas==1.4.0
+ pdfplumber==0.6.2
+ Pillow==9.1.1
+ seaborn==0.11.2
+ transformers==4.21.2
+ st-annotated-text==3.0.0
+ markdown==3.4.1
+ summa==1.2.0
+ altair==4.0
+ streamlit-aggrid
+ python-docx
+ streamlit_option_menu
style.css ADDED
@@ -0,0 +1,180 @@
1
+
2
+ .row-widget.stTextInput > div:first-of-type {
3
+ background: #fff;
4
+ display: flex;
5
+ border: 1px solid #dfe1e5;
6
+ box-shadow: none;
7
+ border-radius: 24px;
8
+ height: 50px;
9
+ width: auto;
10
+ margin: 10px auto 30px;
11
+ }
12
+
13
+ .row-widget.stTextInput > div:first-of-type:hover,
14
+ .row-widget.stTextInput > div:first-of-type:focus {
15
+ box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
16
+ }
17
+
18
+ .row-widget.stTextInput .st-bq {
19
+ background-color: #fff;
20
+ }
21
+
22
+ .row-widget.stTextInput > label {
23
+ color: #b3b3b3;
24
+ }
25
+
26
+ .row-widget.stButton > button {
27
+ border-radius: 24px;
28
+ background-color: #B6C9B1;
29
+ color: #fff;
30
+ border: none;
31
+ padding: 6px 20px;
32
+ float: right;
33
+ background-image: none;
34
+ }
35
+
36
+ .row-widget.stButton > button:hover {
37
+ box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
38
+ }
39
+
40
+ .row-widget.stButton > button:focus {
41
+ border: none;
42
+ color: #fff;
43
+ }
44
+
45
+ .footer-custom {
46
+ position: fixed;
47
+ bottom: 0;
48
+ width: 100%;
49
+ color: var(--text-color);
50
+ max-width: 698px;
51
+ font-size: 14px;
52
+ height: 50px;
53
+ padding: 10px 0;
54
+ z-index: 50;
55
+ }
56
+
57
+ .main {
58
+ padding: 20px;
59
+ }
60
+
61
+ footer {
62
+ display: none !important;
63
+ }
64
+
65
+ .footer-custom a {
66
+ color: var(--text-color);
67
+ }
68
+
69
+ #wikipedia-assistant {
70
+ font-size: 36px;
71
+ }
72
+
73
+ .generated-answer p {
74
+ font-size: 16px;
75
+ font-weight: bold;
76
+ }
77
+
78
+ .react-json-view {
79
+ margin: 40px 0 80px;
80
+ }
81
+
82
+ .tooltip {
83
+ text-align: center;
84
+ line-height: 20px;
85
+ display: table-caption;
86
+ font-size: 10px;
87
+ border-radius: 50%;
88
+ height: 20px;
89
+ width: 20px;
90
+ position: relative;
91
+ cursor: pointer;
92
+ color:#000;
93
+ }
94
+
95
+ .tooltip .tooltiptext {
96
+ visibility: hidden;
97
+ width: 280px;
98
+ text-align: center;
99
+ border-radius: 6px;
100
+ padding: 10px;
101
+ position: absolute;
102
+ z-index: 1;
103
+ top: 25px;
104
+ left: 50%;
105
+ margin-left: -140px;
106
+ font-size: 14px;
107
+ background-color: #fff;
108
+ border: 1px solid #ccc;
109
+ box-shadow: 0px 0px 3px 1px rgba(0, 0, 0, 0.16);
110
+ color: #000;
111
+ }
112
+
113
+ .tooltip:hover .tooltiptext {
114
+ visibility: visible;
115
+ }
116
+
117
+ .sentence-wrapper {
118
+ border-left: 4px solid #ffc423;
119
+ padding-left: 20px;
120
+ margin-bottom: 40px;
121
+ }
122
+
123
+ #context {
124
+ padding: 2rem 0 1rem;
125
+ }
126
+
127
+ hr {
128
+ margin: 2em 0 1em;
129
+ }
130
+
131
+
132
+ .technical-details-info {
133
+ margin-bottom: 100px;
134
+ }
135
+
136
+ .loader-wrapper {
137
+ display: flex;
138
+ align-items: center;
139
+ background-color: rgba(250, 202, 43, 0.2);
140
+ padding: 15px 20px;
141
+ border-radius: 6px;
142
+ }
143
+
144
+ .loader-wrapper p {
145
+ margin-bottom: 0;
146
+ margin-left: 20px;
147
+ }
148
+
149
+ .loader {
150
+ width: 30px;
151
+ height: 30px;
152
+ border: dotted 5px #868686;
153
+ border-radius: 100%;
154
+ animation: spin 1s linear infinite;
155
+ }
156
+
157
+ .loader-note {
158
+ font-size: 14px;
159
+ color: #b3b3b3;
160
+ margin-left: 5px;
161
+ }
162
+
163
+ @keyframes spin {
164
+ 0% {
165
+ transform: rotate(0deg) scale(0.8);
166
+ border-top-color: transparent;
167
+ border-right-color: transparent;
168
+ }
169
+ 50% { transform: rotate(180deg) scale(1.2);
170
+ border-color: #949494;
171
+ border-top-color: transparent;
172
+ border-right-color: transparent;
173
+ }
174
+ 100% { transform: rotate(360deg) scale(0.8);
175
+ border-color: #bbbbbb;
176
+ border-top-color: transparent;
177
+ border-right-color: transparent;
178
+ }
179
+ }
180
+
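This stylesheet is presumably injected into the Streamlit app at runtime; a minimal sketch of how that is commonly done (the injection code is an assumption and does not appear in this diff):

```python
import streamlit as st

# Inject the custom stylesheet into the Streamlit page.
# Assumes style.css sits next to app.py, as added in this commit.
with open("style.css") as css_file:
    st.markdown(f"<style>{css_file.read()}</style>", unsafe_allow_html=True)
```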
utils/__init__.py ADDED
@@ -0,0 +1 @@
+ # adding for package implementation
utils/checkconfig.py ADDED
@@ -0,0 +1,15 @@
+ import configparser
+ import logging
+
+ def getconfig(configfile_path:str):
+     """
+     Read the .cfg file at configfile_path and return a ConfigParser object.
+     Logs a warning and returns None if the file cannot be opened.
+     """
+
+     config = configparser.ConfigParser()
+
+     try:
+         config.read_file(open(configfile_path))
+         return config
+     except Exception:
+         logging.warning("config file not found: %s", configfile_path)
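A short usage sketch combining getconfig with the paramconfig.cfg added above (the calling code is illustrative; note that ConfigParser returns strings, so numeric values need explicit casting):

```python
from utils.checkconfig import getconfig

config = getconfig("paramconfig.cfg")
if config is not None:  # getconfig only logs a warning when the file is missing
    top_k = config.getint("lexical_search", "TOP_K")          # 20
    retriever = config.get("semantic_search", "RETRIEVER")    # all-mpnet-base-v2
    sdg_threshold = config.getfloat("sdg", "THRESHOLD")       # 0.85
```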
utils/keyword_extraction.py ADDED
@@ -0,0 +1,140 @@
1
+ import pandas as pd
2
+ # from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
3
+ # import nltk
4
+ # nltk.download('stopwords')
5
+ # from nltk.corpus import stopwords
6
+ import pickle
7
+ from typing import List, Text
8
+ import logging
9
+ from summa import keywords
10
+
11
+ try:
12
+ import streamlit as st
13
+ except ImportError:
14
+ logging.info("Streamlit not installed")
15
+
16
+
17
+ def sort_coo(coo_matrix):
18
+ """
19
+ It takes Coordinate format scipy sparse matrix and extracts info from same.\
20
+ 1. https://kavita-ganesan.com/python-keyword-extraction/#.Y2-TFHbMJPb
21
+ """
22
+ tuples = zip(coo_matrix.col, coo_matrix.data)
23
+ return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
24
+
25
+ def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
26
+ """get the feature names and tf-idf score of top n items
27
+
28
+ Params
29
+ ---------
30
+ feature_names: list of words from vectorizer
31
+ sorted_items: tuple returned by sort_coo function defined in \
32
+ keyword_extraction.py
33
+ topn: topn words to be extracted using tfidf
34
+
35
+ Return
36
+ ----------
37
+ results: top extracted keywords
38
+
39
+ """
40
+
41
+ #use only topn items from vector
42
+ sorted_items = sorted_items[:top_n]
43
+ score_vals = []
44
+ feature_vals = []
45
+
46
+ # word index and corresponding tf-idf score
47
+ for idx, score in sorted_items:
48
+
49
+ #keep track of feature name and its corresponding score
50
+ score_vals.append(round(score, 3))
51
+ feature_vals.append(feature_names[idx])
52
+
53
+ results= {}
54
+ for idx in range(len(feature_vals)):
55
+ results[feature_vals[idx]]=score_vals[idx]
56
+
57
+ return results
58
+
59
+
60
+ def tfidf_keyword(textdata:str, vectorizer, tfidfmodel, top_n):
61
+ """
62
+ TFIDF based keywords extraction
63
+
64
+ Params
65
+ ---------
66
+ vectorizer: trained CountVectorizer model
67
+ tfidfmodel: trained TfidfTransformer model
68
+ top_n: Top N keywords to be extracted
69
+ textdata: text data on which keyword extraction is performed
70
+
71
+ Return
72
+ ----------
73
+ keywords: top extracted keywords
74
+
75
+ """
76
+ features = vectorizer.get_feature_names_out()
77
+ tf_idf_vector=tfidfmodel.transform(vectorizer.transform(textdata))
78
+ sorted_items=sort_coo(tf_idf_vector.tocoo())
79
+ results=extract_topn_from_vector(features,sorted_items,top_n)
80
+ keywords = [keyword for keyword in results]
81
+ return keywords
82
+
83
+ def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
84
+ """
85
+ TFIDF based keywords extraction
86
+
87
+ Params
88
+ ---------
89
+ sdg: which sdg tfidf model to be used
90
+ sdgdata: text data on which keyword extraction is performed
91
+
92
+
93
+ Return
94
+ ----------
95
+ keywords: top extracted keywords
96
+
97
+ """
98
+ model_path = "docStore/sdg{}/".format(sdg)
99
+ vectorizer = pickle.load(open(model_path+'vectorizer.pkl', 'rb'))
100
+ tfidfmodel = pickle.load(open(model_path+'tfidfmodel.pkl', 'rb'))
101
+ features = vectorizer.get_feature_names_out()
102
+ tf_idf_vector=tfidfmodel.transform(vectorizer.transform(sdgdata))
103
+ sorted_items=sort_coo(tf_idf_vector.tocoo())
104
+ top_n = top_n
105
+ results=extract_topn_from_vector(features,sorted_items,top_n)
106
+ keywords = [keyword for keyword in results]
107
+ return keywords
108
+
109
+ @st.cache(allow_output_mutation=True)
110
+ def textrank(textdata:Text, ratio:float = 0.1, words:int = 0)->List[str]:
111
+ """
112
+ wrapper function to perform textrank, uses either ratio or word count to
113
+ extract top keywords limited by words or ratio.
114
+ 1. https://github.com/summanlp/textrank/blob/master/summa/keywords.py
115
+
116
+ Params
117
+ --------
118
+ textdata: text data to perform the textrank.
119
+ ratio: float to limit the number of keywords as proportion of total token \
120
+ in textdata
121
+ words: number of keywords to be extracted. Takes priority over ratio if
+ non-zero. However, in case pagerank returns fewer keywords than the fixed
+ value, the ratio is used instead.
124
+
125
+ Return
126
+ --------
127
+ results: extracted keywords
128
+ """
129
+ if words == 0:
130
+ logging.info("Textrank using defulat ratio value = 0.1, as no words limit given")
131
+ results = keywords.keywords(textdata, ratio= ratio).split("\n")
132
+ else:
133
+ try:
134
+ results = keywords.keywords(textdata, words= words).split("\n")
135
+ except Exception:
136
+ results = keywords.keywords(textdata, ratio = ratio).split("\n")
137
+
138
+ return results
139
+
140
+
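A hedged usage sketch for the two extraction paths above: textrank needs no fitted models (but is wrapped in st.cache, so Streamlit is assumed to be installed), while keyword_extraction expects pre-trained vectorizer.pkl and tfidfmodel.pkl pickles under docStore/sdg&lt;N&gt;/:

```python
from utils.keyword_extraction import textrank, keyword_extraction

text = ("Climate adaptation and mitigation require sustained investment in "
        "renewable energy, resilient agriculture and social protection.")

# TextRank: cap the number of keywords; falls back to the ratio if needed.
print(textrank(text, words=5))

# TF-IDF path: only runs if the per-SDG pickles exist, e.g. under docStore/sdg13/.
# print(keyword_extraction(sdg=13, sdgdata=[text], top_n=5))
```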
utils/lexical_search.py ADDED
@@ -0,0 +1,251 @@
1
+ from haystack.nodes import TfidfRetriever
2
+ from haystack.document_stores import InMemoryDocumentStore
3
+ import spacy
4
+ import re
5
+ from spacy.matcher import Matcher
6
+ from markdown import markdown
7
+ from annotated_text import annotation
8
+ from haystack.schema import Document
9
+ from typing import List, Text, Tuple
10
+ from typing_extensions import Literal
11
+ from utils.preprocessing import processingpipeline
12
+ from utils.streamlitcheck import check_streamlit
13
+ import logging
14
+ try:
15
+ from termcolor import colored
16
+ except:
17
+ pass
18
+
19
+ try:
20
+ import streamlit as st
21
+ except ImportError:
22
+ logging.info("Streamlit not installed")
23
+
24
+
25
+ def runLexicalPreprocessingPipeline(file_name:str,file_path:str,
26
+ split_by: Literal["sentence", "word"] = 'word',
27
+ split_length:int = 80, split_overlap:int = 0,
28
+ remove_punc:bool = False,)->List[Document]:
29
+ """
30
+ creates and runs the preprocessing pipeline;
+ the params for the pipeline are fetched from paramconfig. As lexical search
+ is not affected by overlap, split_overlap = 0 and split_by = word in the
+ default paramconfig.
34
+
35
+ Params
36
+ ------------
37
+
38
+ file_name: filename, in case of streamlit application use
39
+ st.session_state['filename']
40
+ file_path: filepath, in case of streamlit application use
41
+ st.session_state['filepath']
42
+ split_by: document splitting strategy either as word or sentence
43
+ split_length: when synthetically creating the paragraphs from document,
44
+ it defines the length of paragraph.
45
+ split_overlap: Number of words or sentences that overlap when creating
46
+ the paragraphs. This is done as one sentence or 'some words' make sense
47
+ when read in together with others. Therefore the overlap is used.
48
+ remove_punc: whether to remove all punctuation, including ',' and '.'
50
+
51
+ Return
52
+ --------------
53
+ List[Document]: When preprocessing pipeline is run, the output dictionary
54
+ has four objects. For the lexical search using TfidfRetriever we
55
+ need to use the List of Haystack Document, which can be fetched by
56
+ key = 'documents' on output.
57
+
58
+ """
59
+
60
+ lexical_processing_pipeline = processingpipeline()
61
+
62
+
63
+ output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
64
+ params= {"FileConverter": {"file_path": file_path, \
65
+ "file_name": file_name},
66
+ "UdfPreProcessor": {"remove_punc": remove_punc, \
67
+ "split_by": split_by, \
68
+ "split_length":split_length,\
69
+ "split_overlap": split_overlap}})
70
+
71
+ return output_lexical_pre
72
+
73
+
74
+ def tokenize_lexical_query(query:str)-> List[str]:
75
+ """
76
+ Removes the stop words from query and returns the list of important keywords
77
+ in query. For the lexical search the relevant paragraphs in the document are
+ retrieved using TfidfRetriever from Haystack. However, to highlight these
79
+ keywords we need the tokenized form of query.
80
+
81
+ Params
82
+ --------
83
+ query: string which represents either list of keywords user is looking for
84
+ or a query in form of Question.
85
+
86
+ Return
87
+ -----------
88
+ token_list: list of important keywords in the query.
89
+
90
+ """
91
+ nlp = spacy.load("en_core_web_sm")
92
+ token_list = [token.text.lower() for token in nlp(query)
93
+ if not (token.is_stop or token.is_punct)]
94
+ return token_list
95
+
96
+ def runSpacyMatcher(token_list:List[str], document:Text
97
+ )->Tuple[List[List[int]],spacy.tokens.doc.Doc]:
98
+ """
99
+ Using the spacy in backend finds the keywords in the document using the
100
+ Matcher class from spacy. We can alternatively use the regex, but spacy
101
+ finds all keywords in serialized manner which helps in annotation of answers.
102
+
103
+ Params
104
+ -------
105
+ token_list: this is token list which tokenize_lexical_query function returns
106
+ document: text in which we need to find the tokens
107
+
108
+ Return
109
+ --------
110
+ matches: List of [start_index, end_index] in the spacydoc(at word level not
111
+ character) for the keywords in token list.
112
+
113
+ spacydoc: the keyword index in the spacydoc are at word level and not character,
114
+ therefore to allow the annotator to work seamlessly we return the spacydoc.
115
+
116
+ """
117
+ nlp = spacy.load("en_core_web_sm")
118
+ spacydoc = nlp(document)
119
+ matcher = Matcher(nlp.vocab)
120
+ token_pattern = [[{"LOWER":token}] for token in token_list]
121
+ matcher.add(",".join(token_list), token_pattern)
122
+ spacymatches = matcher(spacydoc)
123
+
124
+ # getting start and end index in spacydoc so that annotator can work seamlessly
125
+ matches = []
126
+ for match_id, start, end in spacymatches:
127
+ matches = matches + [[start, end]]
128
+
129
+ return matches, spacydoc
130
+
131
+ def runRegexMatcher(token_list:List[str], document:Text):
132
+ """
133
+ Using the regex in backend finds the keywords in the document.
134
+
135
+ Params
136
+ -------
137
+ token_list: this is token list which tokenize_lexical_query function returns
138
+
139
+ document: text in which we need to find the tokens
140
+
141
+ Return
142
+ --------
143
+ matches: List of [start_index, end_index] in the document for the keywords
144
+ in token list at character level.
145
+
146
+ document: the keyword index returned by regex are at character level,
147
+ therefore to allow the annotator to work seamlessly we return the text back.
148
+
149
+ """
150
+ matches = []
151
+ for token in token_list:
152
+ matches = (matches +
153
+ [[val.start(), val.start() +
154
+ len(token)] for val in re.finditer(token, document)])
155
+
156
+ return matches, document
157
+
158
+ def spacyAnnotator(matches: List[List[int]], document:spacy.tokens.doc.Doc):
159
+ """
160
+ This is spacy Annotator and needs spacy.doc
161
+ Annotates the text in the document defined by list of [start index, end index]
162
+ Example: "How are you today", if document type is text, matches = [[0,3]]
163
+ will give answer = "How", however in case we used the spacy matcher then the
164
+ matches = [[0,3]] will give answer = "How are you". However if spacy is used
165
+ to find "How" then the matches = [[0,1]] for the string defined above.
166
+
167
+ Params
168
+ -----------
169
+ matches: As mentioned its list of list. Example [[0,1],[10,13]]
170
+ document: document which needs to be indexed.
171
+
172
+
173
+ Return
174
+ --------
175
+ will send the output to either app front end using streamlit or
176
+ write directly to output screen.
177
+
178
+ """
179
+ start = 0
180
+ annotated_text = ""
181
+ for match in matches:
182
+ start_idx = match[0]
183
+ end_idx = match[1]
184
+
185
+ if check_streamlit():
186
+ annotated_text = (annotated_text + document[start:start_idx].text
187
+ + str(annotation(body=document[start_idx:end_idx].text,
188
+ label="ANSWER", background="#964448", color='#ffffff')))
189
+ else:
190
+ annotated_text = (annotated_text + document[start:start_idx].text
191
+ + colored(document[start_idx:end_idx].text,
192
+ "green", attrs = ['bold']))
193
+
194
+
195
+ start = end_idx
196
+
197
+ annotated_text = annotated_text + document[start:].text
198
+
199
+
200
+ if check_streamlit():
201
+
202
+ st.write(
203
+ markdown(annotated_text),
204
+ unsafe_allow_html=True,
205
+ )
206
+ else:
207
+ print(annotated_text)
208
+
209
+ def lexical_search(query:Text, documents:List[Document],top_k:int):
210
+ """
211
+ Performs the Lexical search on the List of haystack documents which is
212
+ returned by preprocessing Pipeline.
213
+
214
+ Params
215
+ -------
216
+ query: Keywords that need to be searched in documents.
217
+ documents: List of Haystack documents returned by preprocessing pipeline.
218
+ top_k: Number of Top results to be fetched.
219
+
220
+ """
221
+
222
+ document_store = InMemoryDocumentStore()
223
+ document_store.write_documents(documents)
224
+
225
+ # Haystack Retriever works with document stores only.
226
+ retriever = TfidfRetriever(document_store)
227
+ results = retriever.retrieve(query=query, top_k = top_k)
228
+ query_tokens = tokenize_lexical_query(query)
229
+ flag = True
230
+ for count, result in enumerate(results):
231
+ matches, doc = runSpacyMatcher(query_tokens,result.content)
232
+
233
+ if len(matches) != 0:
234
+ if flag:
235
+ flag = False
236
+ if check_streamlit():
237
+ st.markdown("##### Top few lexical search (TFIDF) hits #####")
238
+ else:
239
+ print("Top few lexical search (TFIDF) hits")
240
+
241
+ if check_streamlit():
242
+ st.write("Result {}".format(count+1))
243
+ else:
244
+ print("Results {}".format(count +1))
245
+ spacyAnnotator(matches, doc)
246
+
247
+ if flag:
248
+ if check_streamlit():
249
+ st.info("🤔 No relevant result found. Please try another keyword.")
250
+ else:
251
+ print("No relevant result found. Please try another keyword.")
utils/ndc_explorer.py ADDED
@@ -0,0 +1,90 @@
1
+
2
+ import urllib.request
3
+ import json
4
+
5
+ link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
6
+ def get_document(country_code: str):
7
+ """
8
+ read the country NDC data from
9
+ https://klimalog.die-gdi.de/ndc/open-data/dataset.json
10
+ using the country code.
11
+
12
+ Params
13
+ -------
14
+ country_code:"""
15
+ with urllib.request.urlopen(link) as urlfile:
16
+ data = json.loads(urlfile.read())
17
+ categoriesData = {}
18
+ categoriesData['categories']= data['categories']
19
+ categoriesData['subcategories']= data['subcategories']
20
+ keys_sub = categoriesData['subcategories'].keys()
21
+ documentType= 'NDCs'
22
+ if documentType in data.keys():
23
+ if country_code in data[documentType].keys():
24
+ get_dict = {}
25
+ for key, value in data[documentType][country_code].items():
26
+ if key not in ['country_name','region_id', 'region_name']:
27
+ get_dict[key] = value['classification']
28
+ else:
29
+ get_dict[key] = value
30
+ else:
31
+ return None
32
+ else:
33
+ return None
34
+
35
+ country = {}
36
+ for key in categoriesData['categories']:
37
+ country[key]= {}
38
+ for key,value in categoriesData['subcategories'].items():
39
+ country[value['category']][key] = get_dict[key]
40
+
41
+ return country
42
+
43
+
44
+ def countrySpecificCCA(cca_sent:dict, threshold:int, countryCode:str):
45
+ """
46
+ based on the countrycode, reads the country data from
47
+ https://klimalog.die-gdi.de/ndc/open-data/dataset.json
48
+ using get_documents from utils.ndc_explorer.py
49
+ then based on thereshold value filters the Climate Change Adaptation
50
+ targets assigned by NDC explorer team to that country. Using the sentences
51
+ create by Data services team of GIZ for each target level, tries to find the
52
+ relevant passages from the document by doing the semantic search.
53
+
54
+ Params
55
+ -------
56
+ cca_sent: dictionary with key as 'target labels' and manufactured sentences
57
+ reflecting the target level. Please see the docStore/ndcs/cca.txt
58
+
59
+ threshold: NDC target have many categoriees ranging from [0-5], with 0
60
+ refelcting most relaxed attitude and 5 being most aggrisive towards Climate
61
+ change. We select the threshold value beyond which we need to focus on.
62
+
63
+ countryCode: standard country code to allow us to fetch the country specific
64
+ data.
65
+
66
+ """
67
+ temp = {}
68
+ doc = get_document(countryCode)
69
+ for key,value in cca_sent.items():
70
+ id_ = doc['climate change adaptation'][key]['id']
71
+ if id_ >threshold:
72
+ temp[key] = value['id'][id_]
73
+ return temp
74
+
75
+
76
+ def countrySpecificCCM(ccm_sent, threshold, countryCode):
77
+ """
78
+ see the documentation of countrySpecificCCA. This is same instead of
79
+ this gets the data pertaining to Adaptation
80
+
81
+ """
82
+
83
+ temp = {}
84
+ doc = get_document(countryCode)
85
+ for key,value in ccm_sent.items():
86
+ id_ = doc['climate change mitigation'][key]['id']
87
+ if id_ >threshold:
88
+ temp[key] = value['id'][id_]
89
+
90
+ return temp
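Tying this back to the country-name-to-ISO3 mapping earlier in this commit, a sketch of pulling one country's NDC classification (requires network access to klimalog.die-gdi.de; the printed keys follow the category names used in the functions above):

```python
from utils.ndc_explorer import get_document

# ISO3 code as found in the country mapping added earlier, e.g. 'Peru' -> 'PER'.
ndc = get_document("PER")
if ndc is not None:
    print(list(ndc.keys()))                      # NDC explorer categories
    print(ndc.get("climate change adaptation"))  # sub-category classifications
```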
utils/preprocessing.py ADDED
@@ -0,0 +1,260 @@
1
+ from haystack.nodes.base import BaseComponent
2
+ from haystack.schema import Document
3
+ from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
4
+ from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
5
+ from typing import Callable, Dict, List, Optional, Text, Tuple, Union
6
+ from typing_extensions import Literal
7
+ import pandas as pd
8
+ import logging
9
+ import re
10
+ import string
11
+ from haystack.pipelines import Pipeline
12
+
13
+ def useOCR(file_path: str)-> Text:
14
+ """
15
+ Converts image pdfs into text, Using the Farm-haystack[OCR]
16
+
17
+ Params
18
+ ----------
19
+ file_path: file_path of uploaded file, returned by add_upload function in
20
+ uploadAndExample.py
21
+
22
+ Returns the text file as string.
23
+ """
24
+
25
+
26
+ converter = PDFToTextOCRConverter(remove_numeric_tables=True,
27
+ valid_languages=["eng"])
28
+ docs = converter.convert(file_path=file_path, meta=None)
29
+ return docs[0].content
30
+
31
+
32
+
33
+
34
+ class FileConverter(BaseComponent):
35
+ """
36
+ Wrapper class to convert uploaded document into text by calling appropriate
37
+ Converter class, will use internally haystack PDFToTextOCR in case of image
38
+ pdf. Cannot use the FileClassifier from haystack as it doesn't have any
39
+ label/output class for image.
40
+
41
+ 1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
42
+ 2. https://docs.haystack.deepset.ai/docs/file_converters
43
+ 3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
44
+ 4. https://docs.haystack.deepset.ai/reference/file-converters-api
45
+
46
+
47
+ """
48
+
49
+ outgoing_edges = 1
50
+
51
+ def run(self, file_name: str , file_path: str, encoding: Optional[str]=None,
52
+ id_hash_keys: Optional[List[str]] = None,
53
+ ) -> Tuple[dict,str]:
54
+ """ this is required method to invoke the component in
55
+ the pipeline implementation.
56
+
57
+ Params
58
+ ----------
59
+ file_name: name of file
60
+ file_path: file_path of uploaded file, returned by add_upload function in
61
+ uploadAndExample.py
62
+
63
+ See the links provided in Class docstring/description to see other params
64
+
65
+ Return
66
+ ---------
67
+ output: dictionary, with key as identifier and value could be anything
68
+ we need to return. In this case it is the List of Haystack Document
69
+
70
+ output_1: As there is only one outgoing edge, we pass 'output_1' string
71
+ """
72
+ try:
73
+ if file_name.endswith('.pdf'):
74
+ converter = PDFToTextConverter(remove_numeric_tables=True)
75
+ if file_name.endswith('.txt'):
76
+ converter = TextConverter(remove_numeric_tables=True)
77
+ if file_name.endswith('.docx'):
78
+ converter = DocxToTextConverter()
79
+ except Exception as e:
80
+ logging.error(e)
81
+ return
82
+
83
+
84
+
85
+ documents = []
86
+
87
+ document = converter.convert(
88
+ file_path=file_path, meta=None,
89
+ encoding=encoding, id_hash_keys=id_hash_keys
90
+ )[0]
91
+
92
+ text = document.content
93
+
94
+ # if file is image pdf then it will have {'content': "\x0c\x0c\x0c\x0c"}
95
+ # substitute this substring with '', and check if content is empty string
96
+
97
+ text = re.sub(r'\x0c', '', text)
98
+ documents.append(Document(content=text,
99
+ meta={"name": file_name},
100
+ id_hash_keys=id_hash_keys))
101
+
102
+
103
+ # check if text is empty and apply pdfOCR converter.
104
+ for i in documents:
105
+ if i.content == "":
106
+ logging.info("Using OCR")
107
+ i.content = useOCR(file_path)
108
+
109
+ logging.info('file conversion successful')
110
+ output = {'documents': documents}
111
+ return output, 'output_1'
112
+
113
+ def run_batch():
114
+ """
115
+ we dont have requirement to process the multiple files in one go
116
+ therefore nothing here, however to use the custom node we need to have
117
+ this method for the class.
118
+ """
119
+
120
+ return
121
+
122
+
123
+ def basic(s:str, remove_punc:bool = False):
124
+
125
+ """
126
+ Performs basic cleaning of text.
127
+
128
+ Params
129
+ ----------
130
+ s: string to be processed
131
+ removePunc: to remove all Punctuation including ',' and '.' or not
132
+
133
+ Returns: processed string: see comments in the source code for more info
134
+ """
135
+
136
+ # Remove URLs
137
+ s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
138
+ s = re.sub(r"http\S+", " ", s)
139
+
140
+ # Remove new line characters
141
+ s = re.sub('\n', ' ', s)
142
+
143
+ # Remove punctuations
144
+ if remove_punc == True:
145
+ translator = str.maketrans(' ', ' ', string.punctuation)
146
+ s = s.translate(translator)
147
+ # Remove distracting single quotes and dotted pattern
148
+ s = re.sub("\'", " ", s)
149
+ s = s.replace("..","")
150
+
151
+ return s.strip()
152
+
153
+
154
+ class UdfPreProcessor(BaseComponent):
155
+ """
156
+ Class to preprocess the document returned by FileConverter. It checks
+ the splitting strategy, splits the document by words or sentences, and then
+ synthetically creates the paragraphs.
159
+
160
+ 1. https://docs.haystack.deepset.ai/docs/preprocessor
161
+ 2. https://docs.haystack.deepset.ai/reference/preprocessor-api
162
+ 3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
163
+
164
+ """
165
+ outgoing_edges = 1
166
+
167
+ def run(self, documents:List[Document], remove_punc:bool=False,
168
+ split_by: Literal["sentence", "word"] = 'sentence',
169
+ split_length:int = 2, split_respect_sentence_boundary:bool = False,
170
+ split_overlap:int = 0):
171
+
172
+ """ This is the required method to invoke the component in
173
+ the pipeline implementation.
174
+
175
+ Params
176
+ ----------
177
+ documents: documents from the output dictionary returned by Fileconverter
178
+ remove_punc: whether to remove all punctuation, including ',' and '.'
179
+ split_by: document splitting strategy either as word or sentence
180
+ split_length: when synthetically creating the paragraphs from the document,
181
+ it defines the length of paragraph.
182
+ split_respect_sentence_boundary: Used when using 'word' strategy for
183
+ splitting of text.
184
+ split_overlap: Number of words or sentences that overlap when creating
185
+ the paragraphs. This is done as one sentence or 'some words' make sense
186
+ when read in together with others. Therefore the overlap is used.
187
+
188
+ Return
189
+ ---------
190
+ output: dictionary, with key as identifier and value could be anything
191
+ we need to return. In this case the output will contain 4 objects
192
+ the paragraphs text list as List, Haystack document, Dataframe and
193
+ one raw text file.
194
+
195
+ output_1: As there is only one outgoing edge, we pass 'output_1' string
196
+
197
+ """
198
+
199
+ if split_by == 'sentence':
200
+ split_respect_sentence_boundary = False
201
+
202
+ else:
203
+ split_respect_sentence_boundary = split_respect_sentence_boundary
204
+
205
+ preprocessor = PreProcessor(
206
+ clean_empty_lines=True,
207
+ clean_whitespace=True,
208
+ clean_header_footer=True,
209
+ split_by=split_by,
210
+ split_length=split_length,
211
+ split_respect_sentence_boundary= split_respect_sentence_boundary,
212
+ split_overlap=split_overlap,
213
+
214
+ # will add page number only in case of PDF not for text/docx file.
215
+ add_page_number=True
216
+ )
217
+
218
+ for i in documents:
219
+ # # basic cleaning before passing it to preprocessor.
220
+ # i = basic(i)
221
+ docs_processed = preprocessor.process([i])
222
+ for item in docs_processed:
223
+ item.content = basic(item.content, remove_punc= remove_punc)
224
+
225
+ df = pd.DataFrame(docs_processed)
226
+ all_text = " ".join(df.content.to_list())
227
+ para_list = df.content.to_list()
228
+ logging.info('document split into {} paragraphs'.format(len(para_list)))
229
+ output = {'documents': docs_processed,
230
+ 'dataframe': df,
231
+ 'text': all_text,
232
+ 'paraList': para_list
233
+ }
234
+ return output, "output_1"
235
+ def run_batch():
236
+ """
237
+ we dont have requirement to process the multiple files in one go
238
+ therefore nothing here, however to use the custom node we need to have
239
+ this method for the class.
240
+ """
241
+ return
242
+
243
+ def processingpipeline():
244
+ """
245
+ Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcessor
246
+ from utils.preprocessing
247
+
248
+ """
249
+
250
+ preprocessing_pipeline = Pipeline()
251
+ file_converter = FileConverter()
252
+ custom_preprocessor = UdfPreProcessor()
253
+
254
+ preprocessing_pipeline.add_node(component=file_converter,
255
+ name="FileConverter", inputs=["File"])
256
+ preprocessing_pipeline.add_node(component = custom_preprocessor,
257
+ name ='UdfPreProcessor', inputs=["FileConverter"])
258
+
259
+ return preprocessing_pipeline
260
+
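A minimal usage sketch for the preprocessing pipeline defined above (the file name and path below are placeholders; the parameter values mirror the defaults documented in UdfPreProcessor):

    # sketch: run the custom preprocessing pipeline on a local PDF (path is a placeholder)
    from utils.preprocessing import processingpipeline

    pipeline = processingpipeline()
    output = pipeline.run(file_paths="sample/policy.pdf",
                          params={"FileConverter": {"file_path": "sample/policy.pdf",
                                                    "file_name": "policy.pdf"},
                                  "UdfPreProcessor": {"remove_punc": False,
                                                      "split_by": "sentence",
                                                      "split_length": 2,
                                                      "split_overlap": 0}})
    paragraphs = output["paraList"]   # synthetic paragraphs as a list of strings
    docs = output["documents"]        # haystack Documents for downstream nodes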
utils/sdg_classifier.py ADDED
@@ -0,0 +1,177 @@
1
+ from haystack.nodes import TransformersDocumentClassifier
2
+ from haystack.schema import Document
3
+ from typing import List, Tuple
4
+ from typing_extensions import Literal
5
+ import logging
6
+ import pandas as pd
7
+ from pandas import DataFrame, Series
8
+ from utils.checkconfig import getconfig
9
+ from utils.streamlitcheck import check_streamlit
10
+ from utils.preprocessing import processingpipeline
11
+ try:
12
+ import streamlit as st
13
+ except ImportError:
14
+ logging.info("Streamlit not installed")
15
+
16
+ ## Labels dictionary ###
17
+ _lab_dict = {0: 'no_cat',
18
+ 1:'SDG 1 - No poverty',
19
+ 2:'SDG 2 - Zero hunger',
20
+ 3:'SDG 3 - Good health and well-being',
21
+ 4:'SDG 4 - Quality education',
22
+ 5:'SDG 5 - Gender equality',
23
+ 6:'SDG 6 - Clean water and sanitation',
24
+ 7:'SDG 7 - Affordable and clean energy',
25
+ 8:'SDG 8 - Decent work and economic growth',
26
+ 9:'SDG 9 - Industry, Innovation and Infrastructure',
27
+ 10:'SDG 10 - Reduced inequality',
28
+ 11:'SDG 11 - Sustainable cities and communities',
29
+ 12:'SDG 12 - Responsible consumption and production',
30
+ 13:'SDG 13 - Climate action',
31
+ 14:'SDG 14 - Life below water',
32
+ 15:'SDG 15 - Life on land',
33
+ 16:'SDG 16 - Peace, justice and strong institutions',
34
+ 17:'SDG 17 - Partnership for the goals',}
35
+
36
+ @st.cache(allow_output_mutation=True)
37
+ def load_sdgClassifier(config_file:str = None, classifier_name:str = None):
38
+ """
39
+ Loads the document classifier using haystack, where the name/path of the model
+ on the HF hub (as a string) is used to fetch the model object. Either the
+ config file or the model name should be passed.
42
+ 1. https://docs.haystack.deepset.ai/reference/document-classifier-api
43
+ 2. https://docs.haystack.deepset.ai/docs/document_classifier
44
+
45
+ Params
46
+ --------
47
+ config_file: config file path from which to read the model name
48
+ classifier_name: if a model name is passed, it takes priority; if not \
+ provided, the model name is read from the config file; if neither is given, a warning is logged.
50
+
51
+
52
+ Return: document classifier model
53
+ """
54
+ if not classifier_name:
55
+ if not config_file:
56
+ logging.warning("Pass either model name or config file")
57
+ return
58
+ else:
59
+ config = getconfig(config_file)
60
+ classifier_name = config.get('sdg','MODEL')
61
+
62
+ logging.info("Loading classifier")
63
+ doc_classifier = TransformersDocumentClassifier(
64
+ model_name_or_path=classifier_name,
65
+ task="text-classification")
66
+
67
+ return doc_classifier
68
+
69
+
70
+ @st.cache(allow_output_mutation=True)
71
+ def sdg_classification(haystack_doc:List[Document],
72
+ threshold:float = 0.8,
73
+ classifier_model:TransformersDocumentClassifier= None
74
+ )->Tuple[DataFrame,Series]:
75
+ """
76
+ Text-Classification on the list of texts provided. Classifier provides the
77
+ most appropriate label for each text. These labels indicate which particular
+ Sustainable Development Goal (SDG) the text belongs to.
79
+
80
+ Params
81
+ ---------
82
+ haystack_doc: List of haystack Documents. The output of Preprocessing Pipeline
83
+ contains the list of paragraphs in different format,here the list of
84
+ Haystack Documents is used.
85
+ threshold: threshold value for the model to keep the results from classifier
86
+ classifier_model: you can pass the classifier model directly, which takes priority;
87
+ however if not then looks for model in streamlit session.
88
+ In case of streamlit avoid passing the model directly.
89
+
90
+
91
+ Returns
92
+ ----------
93
+ df: Dataframe with two columns['SDG:int', 'text']
94
+ x: Series object with the unique SDG covered in the document uploaded and
95
+ the number of times it is covered/discussed/count_of_paragraphs.
96
+
97
+ """
98
+ logging.info("Working on SDG Classification")
99
+ if not classifier_model:
100
+ if check_streamlit():
101
+ classifier_model = st.session_state['sdg_classifier']
102
+ else:
103
+ logging.warning("No streamlit environment found. Pass the classifier")
104
+ return
105
+
106
+ results = classifier_model.predict(haystack_doc)
107
+
108
+
109
+ labels_= [(l.meta['classification']['label'],
110
+ l.meta['classification']['score'],l.content,) for l in results]
111
+
112
+ df = DataFrame(labels_, columns=["SDG","Relevancy","text"])
113
+
114
+ df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
115
+ df.index += 1
116
+ df =df[df['Relevancy']>threshold]
117
+
118
+ # creating the dataframe for value counts of SDG, along with 'title' of SDGs
119
+ x = df['SDG'].value_counts()
120
+ x = x.rename('count')
121
+ x = x.rename_axis('SDG').reset_index()
122
+ x["SDG"] = pd.to_numeric(x["SDG"])
123
+ x = x.sort_values(by=['count'], ascending=False)
124
+ x['SDG_name'] = x['SDG'].apply(lambda x: _lab_dict[x])
125
+ x['SDG_Num'] = x['SDG'].apply(lambda x: "SDG "+str(x))
126
+
127
+ df['SDG'] = pd.to_numeric(df['SDG'])
128
+ df = df.sort_values('SDG')
129
+
130
+ return df, x
131
+
132
+ def runSDGPreprocessingPipeline(file_name:str, file_path:str,
133
+ split_by: Literal["sentence", "word"] = 'sentence',
134
+ split_length:int = 2, split_respect_sentence_boundary:bool = False,
135
+ split_overlap:int = 0,remove_punc:bool = False)->List[Document]:
136
+ """
137
+ creates the pipeline and runs the preprocessing pipeline,
138
+ the params for pipeline are fetched from paramconfig
139
+
140
+ Params
141
+ ------------
142
+
143
+ file_name: filename, in case of streamlit application use
144
+ st.session_state['filename']
145
+ file_path: filepath, in case of streamlit application use st.session_state['filepath']
146
+ split_by: document splitting strategy either as word or sentence
147
+ split_length: when synthetically creating the paragraphs from the document,
148
+ it defines the length of paragraph.
149
+ split_respect_sentence_boundary: Used when using 'word' strategy for
150
+ splitting of text.
151
+ split_overlap: Number of words or sentences that overlap when creating
152
+ the paragraphs. This is done as one sentence or 'some words' make sense
153
+ when read in together with others. Therefore the overlap is used.
154
+ remove_punc: to remove all Punctuation including ',' and '.' or not
155
+
156
+
157
+ Return
158
+ --------------
159
+ List[Document]: When preprocessing pipeline is run, the output dictionary
160
+ has four objects. For the Haystack implementation of SDG classification, we
161
+ need to use the List of Haystack Document, which can be fetched by
162
+ key = 'documents' on output.
163
+
164
+ """
165
+
166
+ sdg_processing_pipeline = processingpipeline()
167
+
168
+ output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
169
+ params= {"FileConverter": {"file_path": file_path, \
170
+ "file_name": file_name},
171
+ "UdfPreProcessor": {"remove_punc": remove_punc, \
172
+ "split_by": split_by, \
173
+ "split_length":split_length,\
174
+ "split_overlap": split_overlap, \
175
+ "split_respect_sentence_boundary":split_respect_sentence_boundary}})
176
+
177
+ return output_sdg_pre
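A hedged usage sketch of the helpers above outside of Streamlit (the file path and the HF-hub model id below are placeholders, not values fixed by this module):

    # sketch: classify the paragraphs of one document into SDGs (names/paths are placeholders)
    from utils.sdg_classifier import (load_sdgClassifier, sdg_classification,
                                      runSDGPreprocessingPipeline)

    classifier = load_sdgClassifier(classifier_name="your-org/sdg-classifier")  # placeholder model id
    pre_out = runSDGPreprocessingPipeline(file_name="policy.pdf",
                                          file_path="sample/policy.pdf")
    df, counts = sdg_classification(haystack_doc=pre_out["documents"],
                                    threshold=0.8,
                                    classifier_model=classifier)
    print(counts[["SDG_Num", "count"]])   # how often each SDG is discussed in the document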
utils/semantic_search.py ADDED
@@ -0,0 +1,582 @@
1
+ from haystack.nodes import TransformersQueryClassifier, Docs2Answers
2
+ from haystack.nodes import EmbeddingRetriever, FARMReader
3
+ from haystack.nodes.base import BaseComponent
4
+ from haystack.document_stores import InMemoryDocumentStore
5
+ from markdown import markdown
6
+ from annotated_text import annotation
7
+ from haystack.schema import Document
8
+ from typing import List, Text, Union
9
+ from typing_extensions import Literal
10
+ from utils.preprocessing import processingpipeline
11
+ from utils.streamlitcheck import check_streamlit
12
+ from haystack.pipelines import Pipeline
13
+ import pandas as pd
14
+ import logging
15
+ try:
16
+ from termcolor import colored
17
+ except:
18
+ pass
19
+ try:
20
+ import streamlit as st
21
+ except ImportError:
22
+ logging.info("Streamlit not installed")
23
+
24
+
25
+ @st.cache(allow_output_mutation=True)
26
+ def loadQueryClassifier():
27
+ """
28
+ Returns the haystack query classifier model
29
+ model = shahrukhx01/bert-mini-finetune-question-detection
30
+
31
+ """
32
+ query_classifier = TransformersQueryClassifier(model_name_or_path=
33
+ "shahrukhx01/bert-mini-finetune-question-detection")
34
+ return query_classifier
35
+
36
+ class QueryCheck(BaseComponent):
37
+ """
38
+ Uses the Query Classifier from Haystack to process the query based on its type.
+ The model's ability to detect statements is not very good, so there is a chance
+ that statements also get modified. Example: "List water related issues" will be
+ identified by the model as keywords and will therefore be processed as "what
+ are the 'list all water related issues' related issues and discussions?".
+ This shortcoming is ignored for now, as semantic search is not affected much
+ by it. If you want to pass a list of keywords and do batch processing,
+ use run_batch. Example: if you want to find relevant
+ passages for water, food security and poverty, then querylist = ["water", "food
+ security","poverty"] and then execute QueryCheck.run_batch(queries = querylist)
48
+
49
+ 1. https://docs.haystack.deepset.ai/docs/query_classifier
50
+
51
+ """
52
+
53
+ outgoing_edges = 1
54
+
55
+ def run(self, query:str):
56
+ """
57
+ mandatory method to use the custom node. Determines the query type;
+ if the query is of type keyword/statement, it will be modified to make it more
+ useful for sentence transformers.
60
+
61
+ Params
62
+ --------
63
+ query: query/statement/keywords in form of string
64
+
65
+ Return
66
+ ------
67
+ output: dictionary, with key as identifier and value could be anything
68
+ we need to return. In this case the output contain key = 'query'.
69
+
70
+ output_1: As there is only one outgoing edge, we pass 'output_1' string
71
+
72
+ """
73
+ query_classifier = loadQueryClassifier()
74
+ result = query_classifier.run(query=query)
75
+
76
+ if result[1] == "output_1":
77
+ output = {"query":query,
78
+ "query_type": 'question/statement'}
79
+ else:
80
+ output = {"query": "what are the {} related issues and \
81
+ discussions?".format(query),
82
+ "query_type": 'statements/keyword'}
83
+ logging.info(output)
84
+ return output, "output_1"
85
+
86
+ def run_batch(self, queries:List[str]):
87
+ """
88
+ Runs multiple queries in one go; however, the queries need to be passed
+ as a list of strings. Example: if you want to find relevant passages for
90
+ water, food security, poverty then querylist = ["water", "food security",
91
+ "poverty"] and then execute QueryCheck.run_batch(queries = querylist)
92
+
93
+ Params
94
+ --------
95
+ queries: queries/statements/keywords in form of string encapsulated
96
+ within List
97
+
98
+ Return
99
+ ------
100
+ output: dictionary, with key as identifier and value could be anything
101
+ we need to return. In this case the output contain key = 'queries'.
102
+
103
+ output_1: As there is only one outgoing edge, we pass 'output_1' string
104
+ """
105
+ query_classifier = loadQueryClassifier()
106
+ query_list = []
107
+ for query in queries:
108
+ result = query_classifier.run(query=query)
109
+ if result[1] == "output_1":
110
+ query_list.append(query)
111
+ else:
112
+ query_list.append("what are the {} related issues and \
113
+ discussions?".format(query))
114
+ output = {'queries':query_list}
115
+ logging.info(output)
116
+ return output, "output_1"
117
+
118
+
119
+ @st.cache(allow_output_mutation=True)
120
+ def runSemanticPreprocessingPipeline(file_path:str, file_name:str,
121
+ split_by: Literal["sentence", "word"] = 'sentence',
122
+ split_length:int = 2, split_overlap:int = 0,
123
+ split_respect_sentence_boundary:bool = False,
124
+ remove_punc:bool = False)->List[Document]:
125
+ """
126
+ creates the pipeline and runs the preprocessing pipeline.
127
+
128
+ Params
129
+ ------------
130
+
131
+ file_name: filename, in case of streamlit application use
132
+ st.session_state['filename']
133
+ file_path: filepath, in case of streamlit application use
134
+ st.session_state['filepath']
135
+ split_by: document splitting strategy either as word or sentence
136
+ split_length: when synthetically creating the paragraphs from the document,
137
+ it defines the length of paragraph.
138
+ split_overlap: Number of words or sentences that overlap when creating the
139
+ paragraphs. This is done as one sentence or 'some words' make sense
140
+ when read in together with others. Therefore the overlap is used.
141
+ split_respect_sentence_boundary: Used when using 'word' strategy for
142
+ splitting of text.
143
+ remove_punc: to remove all Punctuation including ',' and '.' or not
144
+
145
+ Return
146
+ --------------
147
+ List[Document]: When preprocessing pipeline is run, the output dictionary
148
+ has four objects. For the Haystack implementation of semantic search, we
149
+ need to use the List of Haystack Document, which can be fetched by
150
+ key = 'documents' on output.
151
+
152
+ """
153
+
154
+ semantic_processing_pipeline = processingpipeline()
155
+
156
+ output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
157
+ params= {"FileConverter": {"file_path": file_path, \
158
+ "file_name": file_name},
159
+ "UdfPreProcessor": {"remove_punc": remove_punc, \
160
+ "split_by": split_by, \
161
+ "split_length":split_length,\
162
+ "split_overlap": split_overlap,
163
+ "split_respect_sentence_boundary":split_respect_sentence_boundary}})
164
+
165
+ return output_semantic_pre
166
+
167
+
168
+ @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
169
+ allow_output_mutation=True)
170
+ def loadRetriever(embedding_model:Text=None, embedding_model_format:Text = None,
171
+ embedding_layer:int = None, retriever_top_k:int = 10,
172
+ max_seq_len:int=512, document_store:InMemoryDocumentStore=None):
173
+ """
174
+ Returns the Retriever model based on params provided.
175
+ 1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
176
+ 2. https://www.sbert.net/examples/applications/semantic-search/README.html
177
+ 3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
178
+
179
+
180
+ Params
181
+ ---------
182
+ embedding_model: Name of the model to be used for embedding. Check the links
183
+ provided in documentation
184
+ embedding_model_format: check the Haystack github link provided in the
+ documentation
+ embedding_layer: check the Haystack github link provided in the documentation
+ retriever_top_k: Number of top results to be returned by the retriever
+ max_seq_len: every model has a max sequence length it can handle; check the
+ model card. Needed to handle the edge cases.
190
+ document_store: InMemoryDocumentStore, write haystack Document list to
191
+ DocumentStore and pass the same to function call. Can be done using
192
+ createDocumentStore from utils.
193
+
194
+ Return
195
+ -------
196
+ retriever: embedding model
197
+ """
198
+ logging.info("loading retriever")
199
+ if document_store is None:
200
+ logging.warning("Retriever initialization requires the DocumentStore")
201
+ return
202
+
203
+ retriever = EmbeddingRetriever(
204
+ embedding_model=embedding_model,top_k = retriever_top_k,
205
+ document_store = document_store,
206
+ emb_extraction_layer=embedding_layer, scale_score =True,
207
+ model_format=embedding_model_format, use_gpu = True,
208
+ max_seq_len = max_seq_len )
209
+ if check_streamlit():
210
+ st.session_state['retriever'] = retriever
211
+ return retriever
212
+
213
+ @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
214
+ allow_output_mutation=True)
215
+ def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
216
+ embedding_dim:int = 768):
217
+ """
218
+ Creates the InMemory Document Store from haystack list of Documents.
219
+ It is mandatory component for Retriever to work in Haystack frame work.
220
+
221
+ Params
222
+ -------
223
+ documents: List of haystack document. If using the preprocessing pipeline,
224
+ can be fetched with key = 'documents' on the output of the preprocessing pipeline.
225
+ similarity: scoring function, can be either 'cosine' or 'dot_product'
226
+ embedding_dim: Document store has default value of embedding size = 768, and
227
+ update_embeddings method of Docstore cannot infer the embedding size of
228
+ retiever automatically, therefore set this value as per the model card.
229
+
230
+ Return
231
+ -------
232
+ document_store: InMemory Document Store object type.
233
+
234
+ """
235
+ document_store = InMemoryDocumentStore(similarity = similarity,
236
+ embedding_dim = embedding_dim )
237
+ document_store.write_documents(documents)
238
+
239
+ return document_store
240
+
241
+
242
+ @st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
243
+ allow_output_mutation=True)
244
+ def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
245
+ embedding_model_format:Text = None,embedding_layer:int = None,
246
+ embedding_dim:int = 768,retriever_top_k:int = 10,
247
+ reader_model:str = None, reader_top_k:int = 10,
248
+ max_seq_len:int =512,useQueryCheck = True,
249
+ top_k_per_candidate:int = 1):
250
+ """
251
+ creates the semantic search pipeline and document Store object from the
252
+ list of haystack documents. The top_k for the Reader and Retriever are kept
253
+ same, so that all the results returned by Retriever are used, however the
254
+ context is extracted by Reader for each retrieved result. The querycheck is
255
+ added as node to process the query. This pipeline is suited for keyword search,
256
+ and to some extent extractive QA purpose. The purpose of Reader is strictly to
257
+ highlight the context for retrieved result and not for QA, however as stated
258
+ it can work for QA too in limited sense.
259
+ There are 4 variants of pipeline it can return
260
+ 1.QueryCheck > Retriever > Reader
261
+ 2.Retriever > Reader
262
+ 3.QueryCheck > Retriever > Docs2Answers : If reader is None,
263
+ then Doc2answer is used to keep the output of pipeline structurally same.
264
+ 4.Retriever > Docs2Answers
265
+
266
+ Links
267
+
268
+ 1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
269
+ 2. https://www.sbert.net/examples/applications/semantic-search/README.html
270
+ 3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
271
+ 4. https://docs.haystack.deepset.ai/docs/reader
272
+
273
+
274
+ Params
275
+ ----------
276
+ documents: list of Haystack Documents, returned by the preprocessing pipeline.
277
+ embedding_model: Name of the model to be used for embedding. Check the links
278
+ provided in documentation
279
+ embedding_model_format: check the github link of Haystack provided in
280
+ documentation
281
+ embedding_layer: check the github link of Haystack provided in documentation
282
+ embedding_dim: Document store has default value of embedding size = 768, and
283
+ update_embeddings method of Docstore cannot infer the embedding size of
284
+ retriever automatically, therefore set this value as per the model card.
285
+ retriever_top_k: Number of Top results to be returned by retriever
286
+ reader_model: Name of the model to be used for the Reader node in the haystack
287
+ Pipeline. Check the links provided in documentation
288
+ reader_top_k: Reader will use retrieved results to further find better matches.
289
+ As purpose here is to use reader to extract context, the value is
290
+ same as retriever_top_k.
291
+ max_seq_len: every model has a max sequence length it can handle; check the model card.
+ Needed to handle the edge cases
293
+ useQueryCheck: Whether to use the querycheck which modifies the query or not.
294
+ top_k_per_candidate:How many answers to extract for each candidate doc
295
+ that is coming from the retriever
296
+
297
+ Return
298
+ ---------
299
+ semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
300
+ nodes [QueryCheck, Retriever, Reader/Docs2Answer]. If reader is None,
301
+ then Doc2answer is used to keep the output of pipeline structurally
302
+ same.
303
+
304
+ document_store: As retriever can work only with Haystack Document Store, the
305
+ list of documents returned by the preprocessing pipeline is fed in to
+ get an InMemoryDocumentStore object, with the retriever updating the
307
+ embeddings of each paragraph in document store.
308
+
309
+ """
310
+ document_store = createDocumentStore(documents=documents,
311
+ embedding_dim=embedding_dim)
312
+ retriever = loadRetriever(embedding_model = embedding_model,
313
+ embedding_model_format=embedding_model_format,
314
+ embedding_layer=embedding_layer,
315
+ retriever_top_k= retriever_top_k,
316
+ document_store = document_store,
317
+ max_seq_len=max_seq_len)
318
+ document_store.update_embeddings(retriever)
319
+ semantic_search_pipeline = Pipeline()
320
+ if useQueryCheck and reader_model:
321
+ querycheck = QueryCheck()
322
+ reader = FARMReader(model_name_or_path=reader_model,
323
+ top_k = reader_top_k, use_gpu=True,
324
+ top_k_per_candidate = top_k_per_candidate)
325
+ semantic_search_pipeline.add_node(component = querycheck,
326
+ name = "QueryCheck",inputs = ["Query"])
327
+ semantic_search_pipeline.add_node(component = retriever,
328
+ name = "EmbeddingRetriever",inputs = ["QueryCheck.output_1"])
329
+ semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
330
+ inputs= ["EmbeddingRetriever"])
331
+
332
+ elif reader_model :
333
+ reader = FARMReader(model_name_or_path=reader_model,
334
+ top_k = reader_top_k, use_gpu=True,
335
+ top_k_per_candidate = top_k_per_candidate)
336
+ semantic_search_pipeline.add_node(component = retriever,
337
+ name = "EmbeddingRetriever",inputs = ["Query"])
338
+ semantic_search_pipeline.add_node(component = reader,
339
+ name = "FARMReader",inputs= ["EmbeddingRetriever"])
340
+ elif useQueryCheck and not reader_model:
341
+ querycheck = QueryCheck()
342
+ docs2answers = Docs2Answers()
343
+ semantic_search_pipeline.add_node(component = querycheck,
344
+ name = "QueryCheck",inputs = ["Query"])
345
+ semantic_search_pipeline.add_node(component = retriever,
346
+ name = "EmbeddingRetriever",inputs = ["QueryCheck.output_1"])
347
+ semantic_search_pipeline.add_node(component = docs2answers,
348
+ name = "Docs2Answers",inputs= ["EmbeddingRetriever"])
349
+ elif not useQueryCheck and not reader_model:
350
+ docs2answers = Docs2Answers()
351
+ semantic_search_pipeline.add_node(component = retriever,
352
+ name = "EmbeddingRetriever",inputs = ["Query"])
353
+ semantic_search_pipeline.add_node(component = docs2answers,
354
+ name = "Docs2Answers",inputs= ["EmbeddingRetriever"])
355
+
356
+ logging.info(semantic_search_pipeline.components)
357
+ return semantic_search_pipeline, document_store
358
+
359
+ def runSemanticPipeline(pipeline:Pipeline, queries:Union[list,str])->dict:
360
+ """
361
+ will use the haystack run or run_batch based on if single query is passed
362
+ as string or multiple queries as List[str]
363
+
364
+ Params
365
+ -------
366
+ pipeline: haystack pipeline, this is same as returned by semanticSearchPipeline
367
+ from utils.semanticsearch
368
+
369
+ queries: Either a single query or list of queries.
370
+
371
+ Return
372
+ -------
373
+ results: Dict containing answers and documents as key and their respective
374
+ values
375
+
376
+ """
377
+
378
+ if type(queries) == list:
379
+ results = pipeline.run_batch(queries=queries)
380
+ elif type(queries) == str:
381
+ results = pipeline.run(query=queries)
382
+ else:
383
+ logging.info("Please check the input type for the queries")
384
+ return
385
+
386
+ return results
387
+
388
+ def process_query_output(results:dict)->pd.DataFrame:
389
+ """
390
+ Returns the dataframe with necessary information like including
391
+ ['query','answer','answer_offset','context_offset','context','content',
392
+ 'reader_score','retriever_score','id',]. This is designed for output given
393
+ by semantic search pipeline with single query and final node as reader.
394
+ The output of pipeline having Docs2Answers as final node or multiple queries
395
+ need to be handled separately. In these other cases, use process_semantic_output
396
+ from utils.semantic_search which uses this function internally to make one
397
+ combined dataframe.
398
+
399
+ Params
400
+ ---------
401
+ results: this dictionary should have key,values with
402
+ keys = [query,answers,documents], however answers is optional.
403
+ in case of [Doc2Answers as final node], process_semantic_output
404
+ doesnt return answers thereby setting all values contained in
405
+ answers to 'None'
406
+
407
+ Return
408
+ --------
409
+ df: dataframe with all the columns mentioned in function description.
410
+
411
+ """
412
+ query_text = results['query']
413
+ if 'answers' in results.keys():
414
+ answer_dict = {}
415
+
416
+ for answer in results['answers']:
417
+ answer_dict[answer.document_id] = answer.to_dict()
418
+ else:
419
+ answer_dict = {}
420
+ docs = results['documents']
421
+ df = pd.DataFrame(columns=['query','answer','answer_offset','context_offset',
422
+ 'context','content','reader_score','retriever_score',
423
+ 'id'])
424
+ for doc in docs:
425
+ row_list = {}
426
+ row_list['query'] = query_text
427
+ row_list['retriever_score'] = doc.score
428
+ row_list['id'] = doc.id
429
+ row_list['content'] = doc.content
430
+ if doc.id in answer_dict.keys():
431
+ row_list['answer'] = answer_dict[doc.id]['answer']
432
+ row_list['context'] = answer_dict[doc.id]['context']
433
+ row_list['reader_score'] = answer_dict[doc.id]['score']
434
+ answer_offset = answer_dict[doc.id]['offsets_in_document'][0]
435
+ row_list['answer_offset'] = [answer_offset['start'],answer_offset['end']]
436
+ start_idx = doc.content.find(row_list['context'])
437
+ end_idx = start_idx + len(row_list['context'])
438
+ row_list['context_offset'] = [start_idx, end_idx]
439
+ else:
440
+ row_list['answer'] = None
441
+ row_list['context'] = None
442
+ row_list['reader_score'] = None
443
+ row_list['answer_offset'] = None
444
+ row_list['context_offset'] = None
445
+ df_dictionary = pd.DataFrame([row_list])
446
+ df = pd.concat([df, df_dictionary], ignore_index=True)
447
+
448
+ return df
449
+
450
+ def process_semantic_output(results):
451
+ """
452
+ Returns the dataframe with necessary information like including
453
+ ['query','answer','answer_offset','context_offset','context','content',
454
+ 'reader_score','retriever_score','id',]. Distingushes if its single query or
455
+ multi queries by reading the pipeline output dictionary keys.
456
+ Uses the process_query_output to get the dataframe for each query and create
457
+ one concataneted dataframe. In case of Docs2Answers as final node, deletes
458
+ the answers part. See documentations of process_query_output.
459
+
460
+ Params
461
+ ---------
462
+ results: raw output of runSemanticPipeline.
463
+
464
+ Return
465
+ --------
466
+ df: dataframe with all the columns mentioned in function description.
467
+
468
+ """
469
+ output = {}
470
+ if 'query' in results.keys():
471
+ output['query'] = results['query']
472
+ output['documents'] = results['documents']
473
+ if results['node_id'] == 'Docs2Answers':
474
+ pass
475
+ else:
476
+ output['answers'] = results['answers']
477
+ df = process_query_output(output)
478
+ return df
479
+ if 'queries' in results.keys():
480
+ df = pd.DataFrame(columns=['query','answer','answer_offset',
481
+ 'context_offset','context','content',
482
+ 'reader_score','retriever_score','id'])
483
+ for query,answers,documents in zip(results['queries'],
484
+ results['answers'],results['documents']):
485
+ output = {}
486
+ output['query'] = query
487
+ output['documents'] = documents
488
+ if results['node_id'] == 'Docs2Answers':
489
+ pass
490
+ else:
491
+ output['answers'] = answers
492
+
493
+ temp = process_query_output(output)
494
+ df = pd.concat([df, temp], ignore_index=True)
495
+
496
+
497
+ return df
498
+
499
+ def semanticsearchAnnotator(matches:List[List[int]], document:Text):
500
+ """
501
+ Annotates the text in the document defined by list of [start index, end index]
502
+ Example: "How are you today", if document type is text, matches = [[0,3]]
503
+ will give answer = "How", however in case we used the spacy matcher then the
504
+ matches = [[0,3]] will give answer = "How are you". However if spacy is used
505
+ to find "How" then the matches = [[0,1]] for the string defined above.
506
+
507
+ """
508
+ start = 0
509
+ annotated_text = ""
510
+ for match in matches:
511
+ start_idx = match[0]
512
+ end_idx = match[1]
513
+ if check_streamlit():
514
+ annotated_text = (annotated_text + document[start:start_idx]
515
+ + str(annotation(body=document[start_idx:end_idx],
516
+ label="Context", background="#964448", color='#ffffff')))
517
+ else:
518
+ annotated_text = (annotated_text + document[start:start_idx]
519
+ + colored(document[start_idx:end_idx],
520
+ "green", attrs = ['bold']))
521
+ start = end_idx
522
+
523
+ annotated_text = annotated_text + document[end_idx:]
524
+
525
+ if check_streamlit():
526
+
527
+ st.write(
528
+ markdown(annotated_text),
529
+ unsafe_allow_html=True,
530
+ )
531
+ else:
532
+ print(annotated_text)
533
+
534
+
535
+ def semantic_keywordsearch(query:Text,documents:List[Document],
536
+ embedding_model:Text,
537
+ embedding_model_format:Text,
538
+ embedding_layer:int, reader_model:str,
539
+ retriever_top_k:int = 10, reader_top_k:int = 10,
540
+ return_results:bool = False, embedding_dim:int = 768,
541
+ max_seq_len:int = 512,top_k_per_candidate:int =1,
542
+ sort_by:Literal["retriever", "reader"] = 'retriever'):
543
+ """
544
+ Performs the Semantic search on the List of haystack documents which is
545
+ returned by preprocessing Pipeline.
546
+
547
+ Params
548
+ -------
549
+ query: Keywords that need to be searched in the documents.
550
+ documents: List of Haystack documents returned by the preprocessing pipeline.
551
+
552
+ """
553
+ semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents = documents,
554
+ embedding_model= embedding_model,
555
+ embedding_layer= embedding_layer,
556
+ embedding_model_format= embedding_model_format,
557
+ reader_model= reader_model, retriever_top_k= retriever_top_k,
558
+ reader_top_k= reader_top_k, embedding_dim=embedding_dim,
559
+ max_seq_len=max_seq_len,
560
+ top_k_per_candidate=top_k_per_candidate)
561
+
562
+ raw_output = runSemanticPipeline(semanticsearch_pipeline,query)
563
+ results_df = process_semantic_output(raw_output)
564
+ if sort_by == 'retriever':
565
+ results_df = results_df.sort_values(by=['retriever_score'], ascending=False)
566
+ else:
567
+ results_df = results_df.sort_values(by=['reader_score'], ascending=False)
568
+
569
+ if return_results:
570
+ return results_df
571
+ else:
572
+ if check_streamlit():
573
+ st.markdown("##### Top few semantic search results #####")
574
+ else:
575
+ print("Top few semantic search results")
576
+ for i in range(len(results_df)):
577
+ if check_streamlit():
578
+ st.write("Result {}".format(i+1))
579
+ else:
580
+ print("Result {}".format(i+1))
581
+ semanticsearchAnnotator([results_df.loc[i]['context_offset']],
582
+ results_df.loc[i]['content'] )
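A usage sketch for the semantic search helpers above (the embedding and reader model names below are assumptions for illustration, not values fixed by this module):

    # sketch: semantic/keyword search over a preprocessed document (model names are assumptions)
    from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_keywordsearch

    pre_out = runSemanticPreprocessingPipeline(file_path="sample/policy.pdf",
                                               file_name="policy.pdf",
                                               split_by="sentence", split_length=3)
    results = semantic_keywordsearch(query="climate change adaptation",
                                     documents=pre_out["documents"],
                                     embedding_model="sentence-transformers/all-MiniLM-L6-v2",
                                     embedding_model_format="sentence_transformers",
                                     embedding_layer=None,
                                     embedding_dim=384,
                                     reader_model="deepset/tinyroberta-squad2",
                                     return_results=True)
    print(results[["query", "answer", "retriever_score"]].head())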
utils/streamlitcheck.py ADDED
@@ -0,0 +1,42 @@
1
+ import logging
2
+ try:
3
+ import streamlit as st
4
+ except ImportError:
5
+ logging.info("Streamlit not installed")
6
+
7
+
8
+ def check_streamlit():
9
+ """
10
+ Function to check whether python code is run within streamlit
11
+
12
+ Returns
13
+ -------
14
+ use_streamlit : boolean
15
+ True if code is run within streamlit, else False
16
+ """
17
+ try:
18
+ from streamlit.scriptrunner.script_run_context import get_script_run_ctx
19
+ if not get_script_run_ctx():
20
+ use_streamlit = False
21
+ else:
22
+ use_streamlit = True
23
+ except ModuleNotFoundError:
24
+ use_streamlit = False
25
+ return use_streamlit
26
+
27
+ def disable_other_checkboxes(*other_checkboxes_keys):
28
+ for checkbox_key in other_checkboxes_keys:
29
+ st.session_state[checkbox_key] = False
30
+
31
+ def checkbox_without_preselect(keylist):
32
+ dict_ = {}
33
+ for i,key_val in enumerate(keylist):
34
+ dict_[i] = st.checkbox(key_val,key = key_val,
35
+ on_change = disable_other_checkboxes,
36
+ args=tuple(list(filter(lambda x: x!= key_val, keylist))),)
37
+
38
+ for key,val in dict_.items():
39
+ if val == True:
40
+ return keylist[int(key)]
41
+
42
+ return None
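For context, a minimal sketch of how other modules in this repo branch on check_streamlit():

    # sketch: choose Streamlit vs. console output depending on the runtime
    from utils.streamlitcheck import check_streamlit

    if check_streamlit():
        import streamlit as st
        st.write("running inside a Streamlit script")
    else:
        print("running as a plain Python script")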
utils/uploadAndExample.py ADDED
@@ -0,0 +1,33 @@
1
+ import streamlit as st
2
+ import tempfile
3
+ import json
4
+
5
+ def add_upload(choice):
6
+ """
7
+ Provides the user with the choice to either 'Upload Document' or 'Try Example'.
8
+ Based on the user's choice, runs streamlit processes and saves the path and name of
9
+ the 'file' to streamlit session_state which then can be fetched later.
10
+
11
+ """
12
+
13
+ if choice == 'Upload Document':
14
+ uploaded_file = st.sidebar.file_uploader('Upload the File',
15
+ type=['pdf', 'docx', 'txt'])
16
+ if uploaded_file is not None:
17
+ with tempfile.NamedTemporaryFile(mode="wb", delete = False) as temp:
18
+ bytes_data = uploaded_file.getvalue()
19
+ temp.write(bytes_data)
20
+ st.session_state['filename'] = uploaded_file.name
21
+ st.session_state['filepath'] = temp.name
22
+
23
+
24
+ else:
25
+ # listing the options
26
+ with open('docStore/sample/files.json','r') as json_file:
27
+ files = json.load(json_file)
28
+
29
+ option = st.sidebar.selectbox('Select the example document',
30
+ list(files.keys()))
31
+ file_name = file_path = files[option]
32
+ st.session_state['filename'] = file_name
33
+ st.session_state['filepath'] = file_path
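A sketch of a typical call site for add_upload inside an appStore page (the radio widget shown here is an assumption about the calling page, not part of this module):

    # sketch: let the user upload a file or pick an example, then read it back from session_state
    import streamlit as st
    from utils.uploadAndExample import add_upload

    choice = st.sidebar.radio(label="Select the Document",
                              options=("Upload Document", "Try Example"))
    add_upload(choice)
    if "filepath" in st.session_state:
        st.write("File to process:", st.session_state["filename"])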
ver0.1 scripts/cleaning.py ADDED
@@ -0,0 +1,168 @@
1
+ import logging
2
+ import pandas as pd
3
+ import numpy as np
4
+ import string
5
+ import nltk
6
+ import spacy
7
+ import en_core_web_sm
8
+ import re
9
+ import streamlit as st
10
+
11
+ from haystack.nodes import PreProcessor
12
+
13
+ '''basic cleaning - suitable for transformer models'''
14
+ def basic(s,SDG = False):
15
+ """
16
+ :param s: string to be processed
17
+ :return: processed string: see comments in the source code for more info
18
+ """
19
+ # Text Lowercase
20
+ #s = s.lower()
21
+ # Remove punctuation
22
+ #translator = str.maketrans(' ', ' ', string.punctuation)
23
+ #s = s.translate(translator)
24
+ # Remove URLs
25
+ s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
26
+ s = re.sub(r"http\S+", " ", s)
27
+ if SDG == True:
28
+ s = s.lower()
29
+ translator = str.maketrans(' ', ' ', string.punctuation)
30
+ s = s.translate(translator)
31
+ s = re.sub('\n', ' ', s)
32
+ s = re.sub("\'", " ", s)
33
+ s = re.sub(r'\d+', ' ', s)
34
+ s = re.sub(r'\W+', ' ', s)
35
+
36
+ # Remove new line characters
37
+ #s = re.sub('\n', ' ', s)
38
+
39
+ # Remove distracting single quotes
40
+ #s = re.sub("\'", " ", s)
41
+ # Remove all remaining numbers and non alphanumeric characters
42
+ #s = re.sub(r'\d+', ' ', s)
43
+ #s = re.sub(r'\W+', ' ', s)
44
+
45
+ # define custom words to replace:
46
+ #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
47
+
48
+ return s.strip()
49
+
50
+
51
+ def preprocessingForSDG(document):
52
+
53
+ """
54
+ takes in haystack document object and splits it into paragraphs and applies simple cleaning.
55
+
56
+ Returns cleaned list of haystack document objects. One paragraph per object. Also returns pandas df and
57
+ list that contains all text joined together.
58
+ """
59
+
60
+ preprocessor = PreProcessor(
61
+ clean_empty_lines=True,
62
+ clean_whitespace=True,
63
+ clean_header_footer=True,
64
+ split_by="word",
65
+ split_length=120,
66
+ split_respect_sentence_boundary=False,
67
+ #split_overlap=1
68
+ )
69
+ for i in document:
70
+ docs_processed = preprocessor.process([i])
71
+ for item in docs_processed:
72
+ item.content = basic(item.content, SDG = True)
73
+
74
+ with st.spinner("👑 document being split into paragraphs"):
75
+ logging.info("document has been split into {} paragraphs".format(len(docs_processed)))
76
+
77
+ # create dataframe of text and list of all text
78
+ df = pd.DataFrame(docs_processed)
79
+ all_text = " ".join(df.content.to_list())
80
+ par_list = df.content.to_list()
81
+
82
+ return docs_processed, df, all_text, par_list
83
+
84
+ def preprocessing(document):
85
+
86
+ """
87
+ takes in haystack document object and splits it into paragraphs and applies simple cleaning.
88
+
89
+ Returns cleaned list of haystack document objects. One paragraph per object. Also returns pandas df and
90
+ list that contains all text joined together.
91
+ """
92
+
93
+ preprocessor = PreProcessor(
94
+ clean_empty_lines=True,
95
+ clean_whitespace=True,
96
+ clean_header_footer=True,
97
+ split_by="sentence",
98
+ split_length=3,
99
+ split_respect_sentence_boundary=False,
100
+ split_overlap=1
101
+ )
102
+ for i in document:
103
+ docs_processed = preprocessor.process([i])
104
+ for item in docs_processed:
105
+ item.content = basic(item.content)
106
+
107
+ with st.spinner("👑 document being split into paragraphs"):
108
+ logging.info("document has been split into {} paragraphs".format(len(docs_processed)))
109
+
110
+ # create dataframe of text and list of all text
111
+ df = pd.DataFrame(docs_processed)
112
+ all_text = " ".join(df.content.to_list())
113
+ par_list = df.content.to_list()
114
+
115
+ return docs_processed, df, all_text, par_list
116
+
117
+ '''processing with spacy - suitable for models such as tf-idf, word2vec'''
118
+ def spacy_clean(alpha:str, use_nlp:bool = True) -> str:
119
+
120
+ """
121
+
122
+ Clean and tokenise a string using Spacy. Keeps only alphabetic characters, removes stopwords and
123
+
124
+ filters out all but proper nouns, nouns, verbs and adjectives.
125
+
126
+ Parameters
127
+ ----------
128
+ alpha : str
129
+
130
+ The input string.
131
+
132
+ use_nlp : bool, default False
133
+
134
+ Indicates whether Spacy needs to use NLP. Enable this when using this function on its own.
135
+
136
+ Should be set to False if used inside nlp.pipeline
137
+
138
+ Returns
139
+ -------
140
+ ' '.join(beta) : a concatenated list of lemmatised tokens, i.e. a processed string
141
+
142
+ Notes
143
+ -----
144
+ Fails if alpha is an NA value. Performance decreases as len(alpha) gets large.
145
+ Use together with nlp.pipeline for batch processing.
146
+
147
+ """
148
+
149
+ nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
150
+
151
+ if use_nlp:
152
+
153
+ alpha = nlp(alpha)
154
+
155
+
156
+
157
+ beta = []
158
+
159
+ for tok in alpha:
160
+
161
+ if all([tok.is_alpha, not tok.is_stop, tok.pos_ in ['PROPN', 'NOUN', 'VERB', 'ADJ']]):
162
+
163
+ beta.append(tok.lemma_)
164
+
165
+
166
+ text = ' '.join(beta)
167
+ text = text.lower()
168
+ return text
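A short sketch of the three cleaning levels above (assuming the module is importable as udfPreprocess.cleaning, as the other ver0.1 scripts do; the example text is made up):

    # sketch: light cleaning, SDG-style heavy cleaning, and spaCy lemmatisation
    import udfPreprocess.cleaning as clean

    text = "See https://example.org\nCO2 emissions must fall by 45% by 2030."
    light = clean.basic(text)             # keeps punctuation, drops URLs and newlines
    heavy = clean.basic(text, SDG=True)   # lowercases, strips punctuation and digits
    lemmas = clean.spacy_clean(text)      # lemmatised proper nouns/nouns/verbs/adjectives only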
ver0.1 scripts/coherence.py ADDED
@@ -0,0 +1,267 @@
1
+ # set path
2
+ import glob, os, sys; sys.path.append('../udfPreprocess')
3
+
4
+ #import helper
5
+ import udfPreprocess.docPreprocessing as pre
6
+ import udfPreprocess.cleaning as clean
7
+
8
+ #import needed libraries
9
+ import seaborn as sns
10
+ from pandas import DataFrame
11
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
12
+ from sklearn.metrics.pairwise import cosine_similarity
13
+ # from keybert import KeyBERT
14
+ from transformers import pipeline
15
+ import matplotlib.pyplot as plt
16
+ import numpy as np
17
+ import streamlit as st
18
+ import pandas as pd
19
+ from rank_bm25 import BM25Okapi
20
+ from sklearn.feature_extraction import _stop_words
21
+ import string
22
+ from tqdm.autonotebook import tqdm
23
+ import numpy as np
24
+ import urllib.request
25
+ import ast
26
+ import tempfile
27
+ import sqlite3
28
+ import json
29
+ import urllib.request
30
+ import ast
31
+ import docx
32
+ from docx.shared import Inches
33
+ from docx.shared import Pt
34
+ from docx.enum.style import WD_STYLE_TYPE
35
+
36
+ def app():
37
+ # Sidebar
38
+ st.sidebar.title('Check Coherence')
39
+ st.sidebar.write(' ')
40
+ with open('ndcs/countryList.txt') as dfile:
41
+ countryList = dfile.read()
42
+
43
+ countryList = ast.literal_eval(countryList)
44
+ countrynames = list(countryList.keys())
45
+
46
+ option = st.sidebar.selectbox('Select Country', (countrynames))
47
+ countryCode = countryList[option]
48
+
49
+
50
+ with st.container():
51
+ st.markdown("<h1 style='text-align: center; color: black;'> Check Coherence of Policy Document with NDCs</h1>", unsafe_allow_html=True)
52
+ st.write(' ')
53
+ st.write(' ')
54
+
55
+ with st.expander("ℹ️ - About this app", expanded=True):
56
+
57
+ st.write(
58
+ """
59
+ The *Check Coherence* app is an easy-to-use interface built in Streamlit for analysing a policy document and finding its coherence with NDCs/new or updated NDCs - developed by GIZ Data and the Sustainable Development Solution Network.
60
+ """
61
+ )
62
+
63
+ st.markdown("")
64
+
65
+ st.markdown("")
66
+ st.markdown("## 📌 Step One: Upload document of the country selected ")
67
+
68
+ with st.container():
69
+ docs = None
70
+ # asking user for either upload or select existing doc
71
+ choice = st.radio(label = 'Select the Document',
72
+ help = 'You can upload the document \
73
+ or else you can try a example document.',
74
+ options = ('Upload Document', 'Try Example'),
75
+ horizontal = True)
76
+
77
+ if choice == 'Upload Document':
78
+ uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
79
+ if uploaded_file is not None:
80
+ with tempfile.NamedTemporaryFile(mode="wb") as temp:
81
+ bytes_data = uploaded_file.getvalue()
82
+ temp.write(bytes_data)
83
+
84
+ st.write("Uploaded Filename: ", uploaded_file.name)
85
+ file_name = uploaded_file.name
86
+ file_path = temp.name
87
+ docs = pre.load_document(file_path, file_name)
88
+ haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
89
+
90
+ else:
91
+ # listing the options
92
+ option = st.selectbox('Select the example document',
93
+ ('South Africa:Low Emission strategy',
94
+ 'Ethiopia: 10 Year Development Plan'))
95
+ if option == 'South Africa:Low Emission strategy':
96
+ file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
97
+ countryCode = countryList['South Africa']
98
+ st.write("Selected document:", file_name.split('/')[1])
99
+ # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
100
+ # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
101
+ else:
102
+ # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
103
+ file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
104
+ countryCode = countryList['Ethiopia']
105
+ st.write("Selected document:", file_name.split('/')[1])
106
+
107
+ if option is not None:
108
+ docs = pre.load_document(file_path,file_name)
109
+ haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
110
+
111
+ with open('ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
112
+ cca_sent = dfile.read()
113
+
114
+ cca_sent = ast.literal_eval(cca_sent)
115
+
116
+ with open('ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
117
+ ccm_sent = dfile.read()
118
+
119
+ ccm_sent = ast.literal_eval(ccm_sent)
120
+
121
+ with open('ndcs/countryList.txt') as dfile:
122
+ countryList = dfile.read()
123
+
124
+ countryList = ast.literal_eval(countryList)
125
+
126
+ def get_document(countryCode: str):
127
+ link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
128
+ with urllib.request.urlopen(link) as urlfile:
129
+ data = json.loads(urlfile.read())
130
+ categoriesData = {}
131
+ categoriesData['categories']= data['categories']
132
+ categoriesData['subcategories']= data['subcategories']
133
+ keys_sub = categoriesData['subcategories'].keys()
134
+ documentType= 'NDCs'
135
+ if documentType in data.keys():
136
+ if countryCode in data[documentType].keys():
137
+ get_dict = {}
138
+ for key, value in data[documentType][countryCode].items():
139
+ if key not in ['country_name','region_id', 'region_name']:
140
+ get_dict[key] = value['classification']
141
+ else:
142
+ get_dict[key] = value
143
+ else:
144
+ return None
145
+ else:
146
+ return None
147
+
148
+ country = {}
149
+ for key in categoriesData['categories']:
150
+ country[key]= {}
151
+ for key,value in categoriesData['subcategories'].items():
152
+ country[value['category']][key] = get_dict[key]
153
+
154
+ return country
155
+
156
+ # country_ndc = get_document('NDCs', countryList[option])
157
+
158
+ def countrySpecificCCA(cca_sent, threshold, countryCode):
159
+ temp = {}
160
+ doc = get_document(countryCode)
161
+ for key,value in cca_sent.items():
162
+ id_ = doc['climate change adaptation'][key]['id']
163
+ if id_ >threshold:
164
+ temp[key] = value['id'][id_]
165
+ return temp
166
+
167
+
168
+ def countrySpecificCCM(ccm_sent, threshold, countryCode):
169
+ temp = {}
170
+ doc = get_document(countryCode)
171
+ for key,value in ccm_sent.items():
172
+ id_ = doc['climate change mitigation'][key]['id']
173
+ if id_ >threshold:
174
+ temp[key] = value['id'][id_]
175
+
176
+ return temp
177
+
178
+
179
+
180
+ if docs is not None:
181
+ sent_cca = countrySpecificCCA(cca_sent,1,countryCode)
182
+ sent_ccm = countrySpecificCCM(ccm_sent,1,countryCode)
183
+ #st.write(sent_ccm)
184
+ @st.cache(allow_output_mutation=True)
185
+ def load_sentenceTransformer(name):
186
+ return SentenceTransformer(name)
187
+ model = load_sentenceTransformer('all-MiniLM-L6-v2')
188
+
189
+ document_embeddings = model.encode(paraList, show_progress_bar=True)
190
+
191
+ genre = st.radio( "Select Category",('Climate Change Adaptation', 'Climate Change Mitigation'))
192
+ if genre == 'Climate Change Adaptation':
193
+ sent_dict = sent_cca
194
+ sent_labels = []
195
+ for key,sent in sent_dict.items():
196
+ sent_labels.append(sent)
197
+ label_embeddings = model.encode(sent_labels, show_progress_bar=True)
198
+ similarity_high_threshold = 0.55
199
+ similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
200
+ label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)
201
+
202
+ positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
203
+
204
+
205
+ else:
206
+ sent_dict = sent_ccm
207
+ sent_labels = []
208
+ for key,sent in sent_dict.items():
209
+ sent_labels.append(sent)
210
+ label_embeddings = model.encode(sent_labels, show_progress_bar=True)
211
+ similarity_high_threshold = 0.55
212
+ similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
213
+ label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)
214
+
215
+ positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
216
+
217
+
218
+ # sent_labels = []
219
+ # for key,sent in sent_dict.items():
220
+ # sent_labels.append(sent)
221
+
222
+
223
+ # label_embeddings = model.encode(sent_labels, show_progress_bar=True)
224
+
225
+ #similarity_high_threshold = 0.55
226
+ # similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
227
+ #label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)
228
+
229
+ #positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
230
+ document = docx.Document()
231
+ document.add_heading('Document name:{}'.format(file_name), 2)
232
+ section = document.sections[0]
233
+
234
+ # Calling the footer
235
+ footer = section.footer
236
+
237
+ # Calling the paragraph already present in
238
+ # the footer section
239
+ footer_para = footer.paragraphs[0]
240
+
241
+ font_styles = document.styles
242
+ font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
243
+ font_object = font_charstyle.font
244
+ font_object.size = Pt(7)
245
+ # Adding the centered zoned footer
246
+ footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
247
+
248
+ document.add_paragraph("Country Code for which NDC is carried out {}".format(countryCode))
249
+
250
+ for _label_idx, _paragraph_idx in positive_indices:
251
+ st.write("This paragraph: \n")
252
+ document.add_paragraph("This paragraph: \n")
253
+ st.write(paraList[_paragraph_idx])
254
+ st.write(f"Is relevant to: \n {list(sent_dict.keys())[_label_idx]}")
255
+ document.add_paragraph(f"Is relevant to: \n {list(sent_dict.keys())[_label_idx]}")
256
+ st.write('-'*10)
257
+ document.add_paragraph('-'*10)
258
+
259
+ document.save('demo.docx')
260
+ with open("demo.docx", "rb") as file:
261
+ btn = st.download_button(
262
+ label="Download file",
263
+ data=file,
264
+ file_name="demo.docx",
265
+ mime="txt/docx"
266
+ )
267
+
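The matching step above boils down to a label-vs-paragraph cosine-similarity threshold; a standalone sketch of just that step (the labels and paragraphs here are invented):

    # sketch: label-to-paragraph matching with a 0.55 cosine-similarity threshold (inputs invented)
    import numpy as np
    from sentence_transformers import SentenceTransformer
    from sklearn.metrics.pairwise import cosine_similarity

    model = SentenceTransformer("all-MiniLM-L6-v2")
    labels = ["improve irrigation efficiency", "expand renewable power generation"]
    paragraphs = ["The plan scales up solar and wind capacity to 6 GW by 2030.",
                  "Road maintenance budgets are revised annually."]
    sim = cosine_similarity(model.encode(labels), model.encode(paragraphs))
    for label_idx, para_idx in zip(*np.where(sim > 0.55)):
        print(paragraphs[para_idx], "-> relevant to:", labels[label_idx])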
ver0.1 scripts/docPreprocessing.py ADDED
@@ -0,0 +1,75 @@
1
+ from typing import Callable, Dict, List, Optional
2
+
3
+ from pathlib import Path
4
+ import re
5
+ import logging
6
+ import string
7
+ import streamlit as st
8
+ logger = logging.getLogger(__name__)
9
+
10
+ import os
11
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
12
+
13
+ from haystack.utils import convert_files_to_docs, fetch_archive_from_http
14
+ from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter
15
+ from haystack.nodes.file_converter import PDFToTextConverter, TextConverter
16
+ from haystack.schema import Document
17
+ import pdfplumber
18
+
19
+ import pandas as pd
20
+
21
+ import tempfile
22
+ import sqlite3
23
+
24
+
25
+
26
+ def load_document(
27
+ file_path: str,
28
+ file_name,
29
+ encoding: Optional[str] = None,
30
+ id_hash_keys: Optional[List[str]] = None,
31
+ ) -> List[Document]:
32
+
33
+ """
34
+ takes docx, txt and pdf files as input and \
35
+ extracts text as well as the filename as metadata. \
36
+ Since Haystack cannot handle every pdf file, \
37
+ pdfplumber is attached to the pipeline in case the pdf \
38
+ extraction via Haystack fails.
39
+
40
+ Returns a list of type haystack.schema.Document
41
+ """
42
+
43
+ if file_name.endswith('.pdf'):
44
+ converter = PDFToTextConverter(remove_numeric_tables=True)
45
+ if file_name.endswith('.txt'):
46
+ converter = TextConverter()
47
+ if file_name.endswith('.docx'):
48
+ converter = DocxToTextConverter()
49
+
50
+
51
+ documents = []
52
+ logger.info("Converting {}".format(file_name))
53
+ # PDFToTextConverter, TextConverter, and DocxToTextConverter
54
+ # return a list containing a single Document
55
+ document = converter.convert(
56
+ file_path=file_path, meta=None,
57
+ encoding=encoding, id_hash_keys=id_hash_keys
58
+ )[0]
59
+ text = document.content
60
+ documents.append(Document(content=text,
61
+ meta={"name": file_name},
62
+ id_hash_keys=id_hash_keys))
63
+
64
+ '''check if text is empty and apply different pdf processor. \
65
+ This can happen with certain pdf types.'''
66
+ for i in documents:
67
+ if i.content == "":
68
+ with st.spinner("using pdfplumber"):
69
+ text = []
70
+ with pdfplumber.open(file_path) as pdf:
71
+ for page in pdf.pages:
72
+ text.append(page.extract_text())
73
+ i.content = ' '.join([page for page in text])
74
+
75
+ return documents
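A hypothetical call to load_document, assuming the module is importable as udfPreprocess.docPreprocessing (as the app pages import it) and using one of the sample files bundled with the Space:

import udfPreprocess.docPreprocessing as pre

# sample path taken from uploadAndExample.py; both arguments may point at the same file
file_path = file_name = "sample/South Africa_s Low Emission Development Strategy.txt"
docs = pre.load_document(file_path, file_name)
print(docs[0].meta["name"])       # the original filename is kept as metadata
print(docs[0].content[:300])      # first characters of the extracted text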
ver0.1 scripts/keyword_search.py ADDED
@@ -0,0 +1,169 @@
1
+ # set path
2
+ import glob, os, sys
3
+ from udfPreprocess.search import semantic_search
4
+ sys.path.append('../udfPreprocess')
5
+
6
+ #import helper
7
+ import udfPreprocess.docPreprocessing as pre
8
+ import udfPreprocess.cleaning as clean
9
+ from udfPreprocess.search import bm25_tokenizer, bm25TokenizeDoc, lexical_search
10
+ #import needed libraries
11
+ import seaborn as sns
12
+ from pandas import DataFrame
13
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
14
+ # from keybert import KeyBERT
15
+ from transformers import pipeline
16
+ import matplotlib.pyplot as plt
17
+ import numpy as np
18
+ import streamlit as st
19
+ import pandas as pd
20
+ from rank_bm25 import BM25Okapi
21
+ from sklearn.feature_extraction import _stop_words
22
+ import string
23
+ from tqdm.autonotebook import tqdm
24
+ import numpy as np
25
+ import docx
26
+ from docx.shared import Inches
27
+ from docx.shared import Pt
28
+ from docx.enum.style import WD_STYLE_TYPE
29
+ import logging
30
+ logger = logging.getLogger(__name__)
31
+ import tempfile
32
+ import sqlite3
33
+ import json
34
+ import configparser
35
+
36
+
37
+ def app():
38
+
39
+ with st.container():
40
+ st.markdown("<h1 style='text-align: center; \
41
+ color: black;'> Search</h1>",
42
+ unsafe_allow_html=True)
43
+ st.write(' ')
44
+ st.write(' ')
45
+
46
+ with st.expander("ℹ️ - About this app", expanded=False):
47
+
48
+ st.write(
49
+ """
50
+ The *Keyword Search* app is an easy-to-use interface \
51
+ built in Streamlit for doing keyword search in \
52
+ a policy document - developed by GIZ Data and the \
53
+ Sustainable Development Solution Network.
54
+ """)
55
+
56
+ st.markdown("")
57
+
58
+
59
+
60
+ with st.sidebar:
61
+ with open('sample/keywordexample.json','r') as json_file:
62
+ keywordexample = json.load(json_file)
63
+
64
+ genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
65
+ if genre == 'Food':
66
+ keywordList = keywordexample['Food']
67
+ elif genre == 'Climate':
68
+ keywordList = keywordexample['Climate']
69
+ elif genre == 'Social':
70
+ keywordList = keywordexample['Social']
71
+ elif genre == 'Nature':
72
+ keywordList = keywordexample['Nature']
73
+ elif genre == 'Implementation':
74
+ keywordList = keywordexample['Implementation']
75
+ else:
76
+ keywordList = None
77
+
78
+ searchtype = st.selectbox("Do you want to find exact macthes or similar meaning/context", ['Exact Matches', 'Similar context/meaning'])
79
+
80
+
81
+ with st.container():
82
+ if keywordList is not None:
83
+ queryList = st.text_input("You selcted the {} category we will look for these keywords in document".format(genre),
84
+ value="{}".format(keywordList))
85
+ else:
86
+ queryList = st.text_input("Please enter here your question and we will look \
87
+ for an answer in the document OR enter the keyword you \
88
+ are looking for and we will \
89
+ we will look for similar context \
90
+ in the document.",
91
+ placeholder="Enter keyword here")
92
+
93
+ if st.button("Find them"):
94
+
95
+ if queryList == "":
96
+ st.info("🤔 No keyword provided, if you dont have any, please try example sets from sidebar!")
97
+ logging.warning("Terminated as no keyword provided")
98
+ else:
99
+
100
+ if 'docs' in st.session_state:
101
+ docs = st.session_state['docs']
102
+ paraList = st.session_state['paraList']
103
+
104
+ if searchtype == 'Exact Matches':
105
+ queryList = list(queryList.split(","))
106
+ logging.info("performing lexical search")
107
+ tokenized_corpus = bm25TokenizeDoc(paraList)
108
+ # st.write(len(tokenized_corpus))
109
+ document_bm25 = BM25Okapi(tokenized_corpus)
110
+
111
+ with st.spinner("Performing Exact matching search (Lexical search) for you"):
112
+ st.markdown("##### Top few lexical search (BM25) hits #####")
113
+
114
+ for keyword in queryList:
115
+
116
+ bm25_hits = lexical_search(keyword,document_bm25)
117
+
118
+
119
+ counter = 0
120
+ for hit in bm25_hits:
121
+ if hit['score'] > 0.00:
122
+ counter += 1
123
+ if counter == 1:
124
+ st.markdown("###### Results for keyword: **{}** ######".format(keyword))
125
+ # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
126
+ st.write("\t {}: {}\t".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
127
+
128
+
129
+ if counter == 0:
130
+ st.write("No results found for '**{}**' ".format(keyword))
131
+
132
+ st.markdown("---")
133
+ else:
134
+ logging.info("starting semantic search")
135
+ with st.spinner("Performing Similar/Contextual search"):
136
+ query = "Find {} related issues ?".format(queryList)
137
+ config = configparser.ConfigParser()
138
+ config.read_file(open('udfPreprocess/paramconfig.cfg'))
139
+ threshold = float(config.get('semantic_search','THRESHOLD'))
140
+ # st.write(query)
141
+ semantic_hits = semantic_search(query,paraList)
142
+ st.markdown("##### Few Semantic search hits for {} related topics #####".format(queryList))
143
+
144
+ for i,queryhit in enumerate(semantic_hits):
145
+
146
+ # st.markdown("###### Results for query: **{}** ######".format(queryList[i]))
147
+ counter = 0
148
+ for hit in queryhit:
149
+ counter += 1
150
+
151
+
152
+ if hit['score'] > threshold:
153
+ # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
154
+ st.write("\t {}: \t {}".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))
155
+
156
+ # document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
157
+ st.markdown("---")
158
+ # st.write(semantic_hits)
159
+
160
+
161
+
162
+
163
+ else:
164
+ st.info("🤔 No document found, please try to upload it at the sidebar!")
165
+ logging.warning("Terminated as no keyword provided")
166
+
167
+
168
+
169
+
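The exact-match branch above reduces to BM25 scoring over tokenised paragraphs. A standalone sketch of that flow is shown below; the whitespace tokeniser is a simplified stand-in for bm25_tokenizer, and the paragraphs are invented examples.

import numpy as np
from rank_bm25 import BM25Okapi

paraList = ["Adaptation measures for the agriculture sector.",
            "Mitigation targets for the energy sector by 2030."]
tokenized_corpus = [p.lower().split() for p in paraList]   # stand-in for bm25_tokenizer
document_bm25 = BM25Okapi(tokenized_corpus)

keyword = "energy"
bm25_scores = document_bm25.get_scores(keyword.lower().split())
for idx in np.argsort(bm25_scores)[::-1]:                  # highest score first
    if bm25_scores[idx] > 0.0:
        print(f"{bm25_scores[idx]:.3f}\t{paraList[idx]}")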
ver0.1 scripts/sdg.py ADDED
@@ -0,0 +1,57 @@
1
+ import glob, os, sys;
2
+ sys.path.append('../udfPreprocess')
3
+
4
+ #import helper
5
+ import udfPreprocess.docPreprocessing as pre
6
+ import udfPreprocess.cleaning as clean
7
+
8
+ #import needed libraries
9
+ import seaborn as sns
10
+ from pandas import DataFrame
11
+ from keybert import KeyBERT
12
+ from transformers import pipeline
13
+ import matplotlib.pyplot as plt
14
+ import numpy as np
15
+ import streamlit as st
16
+ import pandas as pd
17
+ import docx
18
+ from docx.shared import Inches
19
+ from docx.shared import Pt
20
+ from docx.enum.style import WD_STYLE_TYPE
21
+
22
+ import tempfile
23
+ import sqlite3
24
+ import logging
25
+ logger = logging.getLogger(__name__)
26
+ import configparser
27
+
28
+ @st.cache(allow_output_mutation=True)
29
+ def load_sdgClassifier():
30
+ classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
31
+ logging.info("Loading classifier")
32
+ return classifier
33
+
34
+ def sdg_classification(par_list):
35
+ logging.info("running SDG classifiication")
36
+ config = configparser.ConfigParser()
37
+ config.read_file(open('udfPreprocess/paramconfig.cfg'))
38
+ threshold = float(config.get('sdg','THRESHOLD'))
39
+
40
+
41
+ classifier = load_sdgClassifier()
42
+ labels = classifier(par_list)
43
+
44
+ labels_= [(l['label'],l['score']) for l in labels]
45
+ # df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
46
+ df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
47
+
48
+ df2['text'] = par_list
49
+ df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
50
+ df2.index += 1
51
+ df2 =df2[df2['Relevancy']>threshold]
52
+ x = df2['SDG'].value_counts()
53
+ df3 = df2.copy()
54
+ df3= df3.drop(['Relevancy'], axis = 1)
55
+
56
+
57
+ return df3, x
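The post-processing in sdg_classification can be traced on hand-written classifier output, so the sketch below runs without downloading the jonas/sdg_classifier_osdg model; the scores and the 0.85 threshold are illustrative (the app reads the threshold from udfPreprocess/paramconfig.cfg).

from pandas import DataFrame

par_list = ["Ensure access to affordable and clean energy.",
            "End poverty in all its forms everywhere.",
            "Some unrelated administrative remark."]
labels = [{"label": "SDG7", "score": 0.93},     # assumed classifier output
          {"label": "SDG1", "score": 0.88},
          {"label": "SDG3", "score": 0.40}]
threshold = 0.85

df2 = DataFrame([(l["label"], l["score"]) for l in labels], columns=["SDG", "Relevancy"])
df2["text"] = par_list
df2 = df2[df2["Relevancy"] > threshold].sort_values("Relevancy", ascending=False)
x = df2["SDG"].value_counts()                   # later feeds the pie chart in sdg_analysis.py
df3 = df2.drop(["Relevancy"], axis=1)
print(df3)
print(x)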
ver0.1 scripts/sdg_analysis.py ADDED
@@ -0,0 +1,160 @@
1
+ # set path
2
+ import glob, os, sys;
3
+ sys.path.append('../udfPreprocess')
4
+
5
+ #import helper
6
+
7
+
8
+ #import needed libraries
9
+ import seaborn as sns
10
+ import matplotlib.pyplot as plt
11
+ import numpy as np
12
+ import streamlit as st
13
+ import docx
14
+ from docx.shared import Inches
15
+ from docx.shared import Pt
16
+ from docx.enum.style import WD_STYLE_TYPE
17
+ from udfPreprocess.sdg_classifier import sdg_classification
18
+ from udfPreprocess.sdg_classifier import runSDGPreprocessingPipeline
19
+ import configparser
20
+ import tempfile
21
+ import sqlite3
22
+ import logging
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+
27
+ def app():
28
+
29
+ with st.container():
30
+ st.markdown("<h1 style='text-align: center; color: black;'> SDSN x GIZ Policy Action Tracking v0.1</h1>", unsafe_allow_html=True)
31
+ st.write(' ')
32
+ st.write(' ')
33
+
34
+ with st.expander("ℹ️ - About this app", expanded=False):
35
+
36
+ st.write(
37
+ """
38
+ The *Analyse Policy Document* app is an easy-to-use interface built in Streamlit for analyzing policy documents with respect to SDG Classification for the paragraphs/texts in the document - developed by GIZ Data and the Sustainable Development Solution Network. \n
39
+ """)
40
+ st.markdown("")
41
+
42
+
43
+ with st.container():
44
+
45
+
46
+
47
+ if 'filepath' in st.session_state:
48
+ paraList = runSDGPreprocessingPipeline()
49
+ with st.spinner("Running SDG"):
50
+
51
+ df, x = sdg_classification(paraList)
52
+
53
+
54
+ # classifier = load_sdgClassifier()
55
+
56
+ # labels = classifier(par_list)
57
+ # labels_= [(l['label'],l['score']) for l in labels]
58
+ # df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
59
+ # df2['text'] = par_list
60
+ # df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
61
+ # df2.index += 1
62
+ # df2 =df2[df2['Relevancy']>.85]
63
+ # x = df2['SDG'].value_counts()
64
+ # df3 = df2.copy()
65
+
66
+ plt.rcParams['font.size'] = 25
67
+ colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
68
+ # plot
69
+ fig, ax = plt.subplots()
70
+ ax.pie(x, colors=colors, radius=2, center=(4, 4),
71
+ wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False,labels =list(x.index))
72
+ # fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
73
+ st.markdown("#### Anything related to SDGs? ####")
74
+
75
+ # st.markdown("#### 🎈 Anything related to SDGs? ####")
76
+
77
+ c4, c5, c6 = st.columns([2, 2, 2])
78
+
79
+ # Add styling
80
+ cmGreen = sns.light_palette("green", as_cmap=True)
81
+ cmRed = sns.light_palette("red", as_cmap=True)
82
+ # df2 = df2.style.background_gradient(
83
+ # cmap=cmGreen,
84
+ # subset=[
85
+ # "Relevancy",
86
+ # ],
87
+ # )
88
+
89
+ # format_dictionary = {
90
+ # "Relevancy": "{:.1%}",
91
+ # }
92
+
93
+ # df2 = df2.format(format_dictionary)
94
+
95
+ with c5:
96
+ st.pyplot(fig)
97
+
98
+ c7, c8, c9 = st.columns([1, 10, 1])
99
+ with c8:
100
+ st.table(df)
101
+
102
+
103
+ # 1. Keyword heatmap \n
104
+ # 2. SDG Classification for the paragraphs/texts in the document
105
+ #
106
+
107
+ # with st.container():
108
+ # if 'docs' in st.session_state:
109
+ # docs = st.session_state['docs']
110
+ # docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
111
+ # # paraList = st.session_state['paraList']
112
+ # logging.info("keybert")
113
+ # with st.spinner("Running Key bert"):
114
+
115
+ # kw_model = load_keyBert()
116
+
117
+ # keywords = kw_model.extract_keywords(
118
+ # all_text,
119
+ # keyphrase_ngram_range=(1, 3),
120
+ # use_mmr=True,
121
+ # stop_words="english",
122
+ # top_n=10,
123
+ # diversity=0.7,
124
+ # )
125
+
126
+ # st.markdown("## 🎈 What is my document about?")
127
+
128
+ # df = (
129
+ # DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
130
+ # .sort_values(by="Relevancy", ascending=False)
131
+ # .reset_index(drop=True)
132
+ # )
133
+ # df1 = (
134
+ # DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
135
+ # .sort_values(by="Relevancy", ascending=False)
136
+ # .reset_index(drop=True)
137
+ # )
138
+ # df.index += 1
139
+
140
+ # # Add styling
141
+ # cmGreen = sns.light_palette("green", as_cmap=True)
142
+ # cmRed = sns.light_palette("red", as_cmap=True)
143
+ # df = df.style.background_gradient(
144
+ # cmap=cmGreen,
145
+ # subset=[
146
+ # "Relevancy",
147
+ # ],
148
+ # )
149
+
150
+ # c1, c2, c3 = st.columns([1, 3, 1])
151
+
152
+ # format_dictionary = {
153
+ # "Relevancy": "{:.1%}",
154
+ # }
155
+
156
+ # df = df.format(format_dictionary)
157
+
158
+ # with c2:
159
+ #
160
+ # st.table(df)
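The pie chart above is plain matplotlib driven by the per-SDG value counts; a minimal reproduction with invented counts:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

x = pd.Series({"SDG7": 12, "SDG13": 9, "SDG1": 4})         # assumed counts for the sketch
colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
fig, ax = plt.subplots()
ax.pie(x, colors=colors, labels=list(x.index),
       wedgeprops={"linewidth": 1, "edgecolor": "white"})
fig.savefig("sdg_share.png", bbox_inches="tight", dpi=100)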
ver0.1 scripts/search.py ADDED
@@ -0,0 +1,141 @@
1
+ import glob, os, sys; sys.path.append('../utils')
2
+
3
+ #import needed libraries
4
+ import seaborn as sns
5
+ from pandas import DataFrame
6
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
7
+ # from keybert import KeyBERT
8
+ from transformers import pipeline
9
+ import matplotlib.pyplot as plt
10
+ import numpy as np
11
+ import streamlit as st
12
+ import pandas as pd
13
+ from rank_bm25 import BM25Okapi
14
+ from sklearn.feature_extraction import _stop_words
15
+ import string
16
+ from tqdm.autonotebook import tqdm
17
+ import numpy as np
18
+ import docx
19
+ from docx.shared import Inches
20
+ from docx.shared import Pt
21
+ from docx.enum.style import WD_STYLE_TYPE
22
+ import logging
23
+ logger = logging.getLogger(__name__)
24
+ import tempfile
25
+ import sqlite3
26
+ import configparser
27
+
28
+ ### These are lexical search related functions #####
29
+
30
+ def bm25_tokenizer(text):
31
+ tokenized_doc = []
32
+ for token in text.lower().split():
33
+ token = token.strip(string.punctuation)
34
+
35
+ if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
36
+ tokenized_doc.append(token)
37
+ return tokenized_doc
38
+
39
+ def bm25TokenizeDoc(paraList):
40
+ tokenized_corpus = []
41
+ ########## Commenting this for now ########### will incorporate paragraph splitting later.
42
+ # for passage in tqdm(paraList):
43
+ # if len(passage.split()) >256:
44
+ # # st.write("Splitting")
45
+ # temp = " ".join(passage.split()[:256])
46
+ # tokenized_corpus.append(bm25_tokenizer(temp))
47
+ # temp = " ".join(passage.split()[256:])
48
+ # tokenized_corpus.append(bm25_tokenizer(temp))
49
+ # else:
50
+ # tokenized_corpus.append(bm25_tokenizer(passage))
51
+ ######################################################################################33333
52
+ for passage in tqdm(paraList):
53
+ tokenized_corpus.append(bm25_tokenizer(passage))
54
+
55
+ return tokenized_corpus
56
+
57
+ def lexical_search(keyword, document_bm25):
58
+ config = configparser.ConfigParser()
59
+ config.read_file(open('udfPreprocess/paramconfig.cfg'))
60
+ top_k = int(config.get('lexical_search','TOP_K'))
61
+ bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
62
+ top_n = np.argpartition(bm25_scores, -top_k)[-top_k:]
63
+ bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
64
+ bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
65
+ return bm25_hits
66
+
67
+ @st.cache(allow_output_mutation=True)
68
+ def load_sentenceTransformer(name):
69
+ return SentenceTransformer(name)
70
+
71
+
72
+ def semantic_search(keywordlist,paraList):
73
+
74
+ ##### Semantic Search #####
75
+ #query = "Does document contain {} issues ?".format(keyword)
76
+ config = configparser.ConfigParser()
77
+ config.read_file(open('udfPreprocess/paramconfig.cfg'))
78
+ model_name = config.get('semantic_search','MODEL_NAME')
79
+
80
+ bi_encoder = load_sentenceTransformer(model_name)
81
+ bi_encoder.max_seq_length = int(config.get('semantic_search','MAX_SEQ_LENGTH')) #Truncate long passages to 256 tokens
82
+ top_k = int(config.get('semantic_search','TOP_K'))
83
+ document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
84
+ question_embedding = bi_encoder.encode(keywordlist, convert_to_tensor=True)
85
+
86
+ hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
87
+
88
+ return hits
89
+
90
+ def show_results(keywordList):
91
+ document = docx.Document()
92
+ # document.add_heading('Document name:{}'.format(file_name), 2)
93
+ section = document.sections[0]
94
+
95
+ # Calling the footer
96
+ footer = section.footer
97
+
98
+ # Calling the paragraph already present in
99
+ # the footer section
100
+ footer_para = footer.paragraphs[0]
101
+
102
+ font_styles = document.styles
103
+ font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
104
+ font_object = font_charstyle.font
105
+ font_object.size = Pt(7)
106
+ # Adding the centered zoned footer
107
+ footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
108
+ document.add_heading('You searched for {}'.format(keywordList), level=1)
109
+ for keyword in keywordList:
110
+
111
+ st.write("Results for Query: {}".format(keyword))
112
+ para = document.add_paragraph().add_run("Results for Query: {}".format(keyword))
113
+ para.font.size = Pt(12)
114
+ bm25_hits, hits = search(keyword)
115
+
116
+ st.markdown("""
117
+ We will provide two kinds of results: the lexical search and the semantic search.
118
+ """)
119
+ # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
120
+ st.markdown("Top few lexical search (BM25) hits")
121
+ document.add_paragraph("Top few lexical search (BM25) hits")
122
+
123
+ for hit in bm25_hits[0:5]:
124
+ if hit['score'] > 0.00:
125
+ st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
126
+ document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
127
+
128
+
129
+
130
+ # st.table(bm25_hits[0:3])
131
+
132
+ st.markdown("\n-------------------------\n")
133
+ st.markdown("Top few Bi-Encoder Retrieval hits")
134
+ document.add_paragraph("\n-------------------------\n")
135
+ document.add_paragraph("Top few Bi-Encoder Retrieval hits")
136
+
137
+ hits = sorted(hits, key=lambda x: x['score'], reverse=True)
138
+ for hit in hits[0:5]:
139
+ # if hit['score'] > 0.45:
140
+ st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
141
+ document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
ver0.1 scripts/uploadAndExample.py ADDED
@@ -0,0 +1,52 @@
1
+ import streamlit as st
2
+ import tempfile
3
+ import udfPreprocess.docPreprocessing as pre
4
+ import udfPreprocess.cleaning as clean
5
+
6
+ def add_upload(choice):
7
+
8
+
9
+ if choice == 'Upload Document':
10
+ uploaded_file = st.sidebar.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
11
+ if uploaded_file is not None:
12
+ with tempfile.NamedTemporaryFile(mode="wb") as temp:
13
+ bytes_data = uploaded_file.getvalue()
14
+ temp.write(bytes_data)
15
+ st.session_state['filename'] = uploaded_file.name
16
+ # st.write("Uploaded Filename: ", uploaded_file.name)
17
+ file_name = uploaded_file.name
18
+ file_path = temp.name
19
+ # docs = pre.load_document(file_path, file_name)
20
+ # haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
21
+ st.session_state['filename'] = file_name
22
+ # st.session_state['paraList'] = paraList
23
+ st.session_state['filepath'] = file_path
24
+
25
+
26
+
27
+ else:
28
+ # listing the options
29
+ option = st.sidebar.selectbox('Select the example document',
30
+ ('South Africa:Low Emission strategy',
31
+ 'Ethiopia: 10 Year Development Plan'))
32
+ if option == 'South Africa:Low Emission strategy':
33
+ file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
34
+ st.session_state['filename'] = file_name
35
+ st.session_state['filepath'] = file_path
36
+ # st.write("Selected document:", file_name.split('/')[1])
37
+ # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
38
+ # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
39
+ else:
40
+ # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
41
+ file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
42
+ st.session_state['filename'] = file_name
43
+ st.session_state['filepath'] = file_path
44
+ # st.write("Selected document:", file_name.split('/')[1])
45
+
46
+ # if option is not None:
47
+ # docs = pre.load_document(file_path,file_name)
48
+ # haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
49
+ # st.session_state['docs'] = docs
50
+ # st.session_state['paraList'] = paraList
51
+
52
+
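How another page picks up what add_upload stores: the session keys mirror the ones set above ('filename', 'filepath'). The import path below is an assumption for this sketch, not necessarily the package layout used in the deployed app.

import streamlit as st
import udfPreprocess.uploadAndExample as upload            # assumed import path

choice = st.sidebar.radio("Document source", ("Upload Document", "Try example"))
upload.add_upload(choice)

if "filepath" in st.session_state:
    st.write("Ready to analyse:", st.session_state["filename"])
else:
    st.info("Please upload a document or pick an example in the sidebar.")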