Spaces · Runtime error
Commit e1b1d60: Duplicate from GIZ/SDSN-demo
Co-authored-by: Prashant Singh <[email protected]>
- .DS_Store +0 -0
- .gitattributes +34 -0
- Policy-Action-Tracker_Concept-Note.pdf +0 -0
- README.md +13 -0
- app.py +18 -0
- appStore/__init__.py +1 -0
- appStore/coherence.py +156 -0
- appStore/info.py +72 -0
- appStore/keyword_search.py +176 -0
- appStore/multiapp.py +70 -0
- appStore/sdg_analysis.py +179 -0
- docStore/img/giz_sdsn_small.jpg +0 -0
- docStore/img/ndc.png +0 -0
- docStore/img/paris.png +0 -0
- docStore/img/sdsn.png +0 -0
- docStore/ndcs/cca.txt +81 -0
- docStore/ndcs/ccm.txt +86 -0
- docStore/ndcs/countryList.txt +170 -0
- docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt +737 -0
- docStore/sample/South Africa_s Low Emission Development Strategy.txt +0 -0
- docStore/sample/files.json +3 -0
- docStore/sample/keywordexample.json +8 -0
- packages.txt +4 -0
- paramconfig.cfg +47 -0
- requirements.txt +19 -0
- style.css +180 -0
- utils/__init__.py +1 -0
- utils/checkconfig.py +15 -0
- utils/keyword_extraction.py +140 -0
- utils/lexical_search.py +251 -0
- utils/ndc_explorer.py +90 -0
- utils/preprocessing.py +260 -0
- utils/sdg_classifier.py +177 -0
- utils/semantic_search.py +582 -0
- utils/streamlitcheck.py +42 -0
- utils/uploadAndExample.py +33 -0
- ver0.1 scripts/cleaning.py +168 -0
- ver0.1 scripts/coherence.py +267 -0
- ver0.1 scripts/docPreprocessing.py +75 -0
- ver0.1 scripts/keyword_search.py +169 -0
- ver0.1 scripts/sdg.py +57 -0
- ver0.1 scripts/sdg_analysis.py +160 -0
- ver0.1 scripts/search.py +141 -0
- ver0.1 scripts/uploadAndExample.py +52 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.gitattributes
ADDED
@@ -0,0 +1,34 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
appStore/img/giz_sdsn.jpg filter=lfs diff=lfs merge=lfs -text
appStore/img/paris.png filter=lfs diff=lfs merge=lfs -text
appStore/img/pic1.png filter=lfs diff=lfs merge=lfs -text
Policy-Action-Tracker_Concept-Note.pdf
ADDED
Binary file (154 kB)
README.md
ADDED
@@ -0,0 +1,13 @@
---
title: SDSN Demo
emoji: 📈
colorFrom: purple
colorTo: blue
sdk: streamlit
sdk_version: 1.10.0
app_file: app.py
pinned: false
duplicated_from: GIZ/SDSN-demo
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,18 @@
import appStore.keyword_search as keyword_search
import appStore.sdg_analysis as sdg_analysis
import appStore.coherence as coherence
import appStore.info as info
from appStore.multiapp import MultiApp
import streamlit as st

st.set_page_config(page_title = 'Climate Policy Intelligence',
                   initial_sidebar_state='expanded', layout="wide")

app = MultiApp()

app.add_app("About", "house", info.app)
app.add_app("Search", "search", keyword_search.app)
app.add_app("SDG Analysis", "gear", sdg_analysis.app)
app.add_app("NDC Comparison", "exclude", coherence.app)

app.run()
appStore/__init__.py
ADDED
@@ -0,0 +1 @@
# creating appstore package
appStore/coherence.py
ADDED
@@ -0,0 +1,156 @@
# set path
import glob, os, sys
sys.path.append('../utils')

import streamlit as st
import ast
import logging
from utils.ndc_explorer import countrySpecificCCA, countrySpecificCCM
from utils.checkconfig import getconfig
from utils.semantic_search import runSemanticPreprocessingPipeline, process_semantic_output
from utils.semantic_search import semanticSearchPipeline, runSemanticPipeline
from st_aggrid import AgGrid
from st_aggrid.shared import ColumnsAutoSizeMode

# Reading data and declaring necessary variables
with open('docStore/ndcs/countryList.txt') as dfile:
    countryList = dfile.read()
countryList = ast.literal_eval(countryList)
countrynames = list(countryList.keys())

with open('docStore/ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
    cca_sent = dfile.read()
cca_sent = ast.literal_eval(cca_sent)

with open('docStore/ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
    ccm_sent = dfile.read()
ccm_sent = ast.literal_eval(ccm_sent)

# Preprocessing and retriever parameters are read from the 'coherence' section of paramconfig.cfg
config = getconfig('paramconfig.cfg')
split_by = config.get('coherence', 'SPLIT_BY')
split_length = int(config.get('coherence', 'SPLIT_LENGTH'))
split_overlap = int(config.get('coherence', 'SPLIT_OVERLAP'))
split_respect_sentence_boundary = bool(int(config.get('coherence',
                                                      'RESPECT_SENTENCE_BOUNDARY')))
remove_punc = bool(int(config.get('coherence', 'REMOVE_PUNC')))
embedding_model = config.get('coherence', 'RETRIEVER')
embedding_model_format = config.get('coherence', 'RETRIEVER_FORMAT')
embedding_layer = int(config.get('coherence', 'RETRIEVER_EMB_LAYER'))
embedding_dim = int(config.get('coherence', 'EMBEDDING_DIM'))
max_seq_len = int(config.get('coherence', 'MAX_SEQ_LENGTH'))
retriever_top_k = int(config.get('coherence', 'RETRIEVER_TOP_K'))


def app():

    #### APP INFO #####
    with st.container():
        st.markdown("<h1 style='text-align: center; \
                      color: black;'> NDC Comparison</h1>",
                    unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):

        st.write(
            """
            The *NDC Comparison* application provides an easy evaluation of \
            coherence between a given policy document and a country's (Intended) \
            Nationally Determined Contribution (INDC/NDC) using open-source \
            data from the German Institute of Development and Sustainability's \
            (IDOS) [NDC Explorer](https://klimalog.idos-research.de/ndc/#NDCExplorer/worldMap?NewAndUpdatedNDC??income???catIncome).\
            """)
        st.write("")
        st.write(""" The user can select a country context via the drop-down menu \
            on the left-hand side of the application. Subsequently, the user is \
            given the opportunity to manually upload another policy document \
            from the same national context or to select a pre-loaded example \
            document. Thereafter, the user can choose between two categories \
            to compare coherence between the documents: climate change adaptation \
            and climate change mitigation. Based on the selected information, \
            the application identifies relevant paragraphs in the uploaded \
            document and assigns them to the respective indicator from the NDC \
            Explorer. Currently, the NDC Explorer has 20 indicators under \
            climate change mitigation (e.g., fossil fuel production, REDD+) and \
            22 indicators under climate change adaptation (e.g., sea level rise, \
            investment needs). The assignment of a paragraph to a corresponding \
            indicator is based on vector similarities; the top 3 results, if found, \
            are shown to the user. """)
        st.write("")
        st.write("")
        st.markdown("Some runtime metrics tested with cpu: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
        col1, col2 = st.columns(2)
        with col1:
            st.caption("OCR File processing")
            # st.markdown('<div style="text-align: center;">50 sec</div>', unsafe_allow_html=True)
            st.write("50 sec")

        with col2:
            st.caption("NDC comparison on 200 paragraphs (~ 35 pages)")
            # st.markdown('<div style="text-align: center;">12 sec</div>', unsafe_allow_html=True)
            st.write("140 sec")

    with st.sidebar:

        option = st.selectbox('Select Country', (countrynames))
        countryCode = countryList[option]
        st.markdown("---")

        genre = st.radio("Select Category", ('Climate Change Adaptation',
                                             'Climate Change Mitigation'))
        st.markdown("---")

    with st.container():
        if st.button("Compare with NDC"):
            sent_cca = countrySpecificCCA(cca_sent, 1, countryCode)
            sent_ccm = countrySpecificCCM(ccm_sent, 1, countryCode)

            if 'filepath' in st.session_state:
                allDocuments = runSemanticPreprocessingPipeline(
                                    file_path=st.session_state['filepath'],
                                    file_name=st.session_state['filename'],
                                    split_by=split_by,
                                    split_length=split_length,
                                    split_overlap=split_overlap,
                                    remove_punc=remove_punc,
                                    split_respect_sentence_boundary=split_respect_sentence_boundary)
                # genre = st.radio( "Select Category",('Climate Change Adaptation', 'Climate Change Mitigation'))
                if genre == 'Climate Change Adaptation':
                    sent_dict = sent_cca
                else:
                    sent_dict = sent_ccm
                sent_labels = []
                for key, sent in sent_dict.items():
                    sent_labels.append(sent)
                if len(allDocuments['documents']) > 100:
                    warning_msg = ": This might take some time, please sit back and relax."
                else:
                    warning_msg = ""
                logging.info("starting coherence analysis, \
                              country selected {}".format(option))
                with st.spinner("Performing Coherence Analysis for {} \
                                 under {} category{}".format(option, genre, warning_msg)):
                    semanticsearch_pipeline, doc_store = semanticSearchPipeline(
                                        documents=allDocuments['documents'],
                                        embedding_model=embedding_model,
                                        embedding_layer=embedding_layer,
                                        embedding_model_format=embedding_model_format,
                                        retriever_top_k=retriever_top_k,
                                        embedding_dim=embedding_dim,
                                        max_seq_len=max_seq_len, useQueryCheck=False)
                    raw_output = runSemanticPipeline(pipeline=semanticsearch_pipeline, queries=sent_labels)
                    results_df = process_semantic_output(raw_output)
                    results_df = results_df.drop(['answer', 'answer_offset',
                                                  'context_offset', 'context', 'reader_score', 'id'],
                                                 axis=1)

                    for i, key in enumerate(list(sent_dict.keys())):
                        st.subheader("Relevant paragraphs for topic: {}".format(key))
                        df = results_df[results_df['query'] == sent_dict[key]].reset_index(drop=True)
                        for j in range(3):
                            st.write('Result {}.'.format(j + 1))
                            st.write(df.loc[j]['content'] + '\n')

            else:
                st.info("🤔 No document found, please try to upload it at the sidebar!")
                logging.warning("Terminated as no document provided")
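The embedding and retrieval work above is delegated to utils/semantic_search.py, which this commit view does not display. As an orientation only, the underlying idea described in the About text (embed the NDC indicator sentences and the document paragraphs, then keep the three most similar paragraphs per indicator) can be sketched with sentence-transformers. The model name and the example texts below are assumptions for illustration, not the Space's actual configuration, which comes from paramconfig.cfg.

```python
# Minimal sketch (not the Space's actual pipeline): rank paragraphs against
# indicator sentences by cosine similarity and keep the top 3 per indicator.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # assumed model

indicator_sentences = ["Droughts are among the five climate risks concerns"]  # e.g. from cca.txt
paragraphs = [
    "Recurrent droughts have severely affected agricultural output in recent years.",
    "The national grid will be extended to rural districts by 2030.",
    "Water scarcity and prolonged dry spells are identified as key climate risks.",
]

query_emb = model.encode(indicator_sentences, convert_to_tensor=True)
para_emb = model.encode(paragraphs, convert_to_tensor=True)

# Cosine-similarity matrix: one row per indicator sentence, one column per paragraph.
scores = util.cos_sim(query_emb, para_emb)

top_k = min(3, len(paragraphs))
for i, indicator in enumerate(indicator_sentences):
    best = scores[i].topk(top_k)
    print("Indicator:", indicator)
    for score, idx in zip(best.values, best.indices):
        print(f"  {score:.2f}  {paragraphs[idx]}")
```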
appStore/info.py
ADDED
@@ -0,0 +1,72 @@
import streamlit as st

def app():

    with open('style.css') as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

    st.markdown("<h2 style='text-align: center; \
                  color: black;'> Policy Action Tracker</h2>",
                unsafe_allow_html=True)

    st.markdown("<div style='text-align: center; \
                  color: grey;'>The Policy Action Tracker is an open-source \
                  digital tool which aims to assist policy analysts and \
                  other users in extracting and filtering relevant \
                  information from policy documents.</div>",
                unsafe_allow_html=True)
    footer = """
        <div class="footer-custom">
            Guidance & Feedback - <a href="https://www.linkedin.com/in/maren-bernlöhr-149891222" target="_blank">Maren Bernlöhr</a> |
            <a href="https://www.linkedin.com/in/manuelkuhm" target="_blank">Manuel Kuhm</a> |
            Developer - <a href="https://www.linkedin.com/in/erik-lehmann-giz/" target="_blank">Erik Lehmann</a> |
            <a href="https://www.linkedin.com/in/jonas-nothnagel-bb42b114b/" target="_blank">Jonas Nothnagel</a> |
            <a href="https://www.linkedin.com/in/prashantpsingh/" target="_blank">Prashant Singh</a> |
        </div>
    """
    st.markdown(footer, unsafe_allow_html=True)

    c1, c2, c3 = st.columns([8, 1, 12])
    with c1:
        st.image("docStore/img/ndc.png")
    with c3:
        st.markdown('<div style="text-align: justify;">The manual extraction \
            of relevant information from text documents is a \
            time-consuming task for any policy analyst. As the amount and length of \
            public policy documents in relation to sustainable development (such as \
            National Development Plans and Nationally Determined Contributions) \
            continuously increases, a major challenge for policy action tracking – the \
            evaluation of stated goals and targets and their actual implementation on \
            the ground – arises. Luckily, Artificial Intelligence (AI) and Natural \
            Language Processing (NLP) methods can help in shortening and easing this \
            task for policy analysts.</div><br>',
            unsafe_allow_html=True)

    intro = """
    <div style="text-align: justify;">

    For this purpose, the United Nations Sustainable Development Solutions \
    Network (SDSN) and the Deutsche Gesellschaft für Internationale \
    Zusammenarbeit (GIZ) GmbH have collaborated in the development \
    of this AI-powered open-source web application that helps find and extract \
    relevant information from public policy documents faster to facilitate \
    evidence-based decision-making processes in sustainable development and beyond.

    This tool allows policy analysts and other users to rapidly \
    search for relevant information/paragraphs in the document according to the \
    user's interest, classify the document's content according to the Sustainable \
    Development Goals (SDGs), and compare climate-related policy documents and NDCs \
    across countries using open data from the German Institute of Development and \
    Sustainability's (IDOS) NDC Explorer.
    To understand the application's functionalities and learn more about \
    the project, see the attached concept note. We hope you like our application 😊

    </div>
    <br>
    """
    st.markdown(intro, unsafe_allow_html=True)
    # st.image("docStore/img/paris.png")
appStore/keyword_search.py
ADDED
@@ -0,0 +1,176 @@
# set path
import glob, os, sys
sys.path.append('../utils')

import streamlit as st
import json
import logging
from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search
from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_keywordsearch
from utils.checkconfig import getconfig
from utils.streamlitcheck import checkbox_without_preselect

# Declare all the necessary variables
config = getconfig('paramconfig.cfg')
split_by = config.get('semantic_search', 'SPLIT_BY')
split_length = int(config.get('semantic_search', 'SPLIT_LENGTH'))
split_overlap = int(config.get('semantic_search', 'SPLIT_OVERLAP'))
split_respect_sentence_boundary = bool(int(config.get('semantic_search',
                                                      'RESPECT_SENTENCE_BOUNDARY')))
remove_punc = bool(int(config.get('semantic_search', 'REMOVE_PUNC')))
embedding_model = config.get('semantic_search', 'RETRIEVER')
embedding_model_format = config.get('semantic_search', 'RETRIEVER_FORMAT')
embedding_layer = int(config.get('semantic_search', 'RETRIEVER_EMB_LAYER'))
embedding_dim = int(config.get('semantic_search', 'EMBEDDING_DIM'))
max_seq_len = int(config.get('semantic_search', 'MAX_SEQ_LENGTH'))
retriever_top_k = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))
reader_model = config.get('semantic_search', 'READER')
reader_top_k = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))
top_k_per_candidate = int(config.get('semantic_search', 'READER_TOP_K_PER_CANDIDATE'))
lexical_split_by = config.get('lexical_search', 'SPLIT_BY')
lexical_split_length = int(config.get('lexical_search', 'SPLIT_LENGTH'))
lexical_split_overlap = int(config.get('lexical_search', 'SPLIT_OVERLAP'))
lexical_remove_punc = bool(int(config.get('lexical_search', 'REMOVE_PUNC')))
lexical_top_k = int(config.get('lexical_search', 'TOP_K'))

def app():

    with st.container():
        st.markdown("<h1 style='text-align: center; \
                      color: black;'> Search</h1>",
                    unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):

        st.write(
            """
            The *Search* app is an interface \
            for doing contextual and keyword searches in \
            policy documents. \
            """)
        st.write("")
        st.write(""" The application allows the user to perform a search \
            based on two options: a lexical search ([TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)) \
            and a semantic search ([bi-encoder](https://www.sbert.net/examples/applications/retrieve_rerank/README.html)). \
            The lexical search only \
            displays paragraphs in the document with exact matching results, \
            while the semantic search shows paragraphs with meaningful connections \
            (e.g., synonyms) based on the search context. Both \
            methods employ a probabilistic retrieval framework in their identification \
            of relevant paragraphs. By default the search is performed using \
            'Semantic Search'; to find 'Exact/Lexical Matches' please tick the \
            checkbox provided, which will bypass semantic search. Furthermore, \
            the application allows the user to search for pre-defined keywords \
            from different thematic buckets present in the sidebar.""")
        st.write("")
        st.write(""" The Exact Matches option returns the top {} findings, and Semantic
            Search provides the top {} answers.""".format(lexical_top_k, retriever_top_k))
        st.write("")
        st.write("")
        st.markdown("Some runtime metrics tested with cpu: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
        col1, col2, col3 = st.columns([2, 4, 4])
        with col1:
            st.caption("OCR File processing")
            # st.markdown('<div style="text-align: center;">50 sec</div>', unsafe_allow_html=True)
            st.write("50 sec")

        with col2:
            st.caption("Lexical Search on 200 paragraphs (~ 35 pages)")
            # st.markdown('<div style="text-align: center;">12 sec</div>', unsafe_allow_html=True)
            st.write("15 sec")

        with col3:
            st.caption("Semantic search on 200 paragraphs (~ 35 pages)")
            # st.markdown('<div style="text-align: center;">120 sec</div>', unsafe_allow_html=True)
            st.write("120 sec (including embedding creation)")

    with st.sidebar:
        with open('docStore/sample/keywordexample.json', 'r') as json_file:
            keywordexample = json.load(json_file)

        # genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
        st.caption("Select Keyword Category")
        genre = checkbox_without_preselect(list(keywordexample.keys()))
        if genre:
            keywordList = keywordexample[genre]
        else:
            keywordList = None

        st.markdown("---")

    with st.container():
        type_hinting = "Please enter here your question and we \
                        will look for an answer in the document \
                        OR enter the keyword you are looking \
                        for and we will look for similar \
                        context in the document. \
                        You can also explore predefined sets of keywords from the sidebar. "
        if keywordList is not None:
            # queryList = st.text_input("You selected the {} category we \
            #                will look for these keywords in document".format(genre)
            #                value="{}".format(keywordList))
            queryList = st.text_input(type_hinting,
                                      value="{}".format(keywordList))
        else:
            queryList = st.text_input(type_hinting,
                                      placeholder="Enter keyword/query here")

        searchtype = st.checkbox("Show only Exact Matches")
        if st.button("Find them"):

            if queryList == "":
                st.info("🤔 No keyword provided, if you don't have any, \
                         please try example sets from the sidebar!")
                logging.warning("Terminated as no keyword provided")
            else:
                if 'filepath' in st.session_state:

                    if searchtype:
                        all_documents = runLexicalPreprocessingPipeline(
                                            file_name=st.session_state['filename'],
                                            file_path=st.session_state['filepath'],
                                            split_by=lexical_split_by,
                                            split_length=lexical_split_length,
                                            split_overlap=lexical_split_overlap,
                                            remove_punc=lexical_remove_punc)
                        logging.info("performing lexical search")
                        with st.spinner("Performing Exact matching search \
                                         (Lexical search) for you"):
                            lexical_search(query=queryList,
                                           documents=all_documents['documents'],
                                           top_k=lexical_top_k)
                    else:
                        all_documents = runSemanticPreprocessingPipeline(
                                            file_path=st.session_state['filepath'],
                                            file_name=st.session_state['filename'],
                                            split_by=split_by,
                                            split_length=split_length,
                                            split_overlap=split_overlap,
                                            remove_punc=remove_punc,
                                            split_respect_sentence_boundary=split_respect_sentence_boundary)
                        if len(all_documents['documents']) > 100:
                            warning_msg = ": This might take some time, please sit back and relax."
                        else:
                            warning_msg = ""

                        logging.info("starting semantic search")
                        with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)):
                            semantic_keywordsearch(query=queryList,
                                                   documents=all_documents['documents'],
                                                   embedding_model=embedding_model,
                                                   embedding_layer=embedding_layer,
                                                   embedding_model_format=embedding_model_format,
                                                   reader_model=reader_model, reader_top_k=reader_top_k,
                                                   retriever_top_k=retriever_top_k, embedding_dim=embedding_dim,
                                                   max_seq_len=max_seq_len,
                                                   top_k_per_candidate=top_k_per_candidate)

                else:
                    st.info("🤔 No document found, please try to upload it at the sidebar!")
                    logging.warning("Terminated as no document provided")
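The lexical branch calls utils/lexical_search.py, which the file list above includes but this commit view does not display. For orientation only, ranking paragraphs against a keyword query with TF-IDF can be sketched with scikit-learn as below; this is an illustrative stand-in under those assumptions, not the repository's lexical_search implementation.

```python
# Illustrative TF-IDF ranking of paragraphs for a keyword query
# (a stand-in sketch, not the repo's utils/lexical_search.py).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

paragraphs = [
    "Renewable energy targets include 500 MW of solar capacity by 2030.",
    "The health sector remains vulnerable to flooding and heat waves.",
    "Fossil fuel subsidies will be phased out gradually.",
]
query = "solar energy"

vectorizer = TfidfVectorizer(stop_words="english")
doc_matrix = vectorizer.fit_transform(paragraphs)   # fit on the document paragraphs
query_vec = vectorizer.transform([query])           # project the query into the same space

scores = cosine_similarity(query_vec, doc_matrix).ravel()
top_k = 2
for idx in scores.argsort()[::-1][:top_k]:
    print(f"{scores[idx]:.2f}  {paragraphs[idx]}")
```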
appStore/multiapp.py
ADDED
@@ -0,0 +1,70 @@
"""Framework for running multiple Streamlit applications as a single app.
"""
import streamlit as st
from PIL import Image
from streamlit_option_menu import option_menu
from utils.uploadAndExample import add_upload

class MultiApp:
    """Framework for combining multiple Streamlit applications.
    Usage:
        def foo():
            st.title("Hello Foo")
        def bar():
            st.title("Hello Bar")
        app = MultiApp()
        app.add_app("Foo", "house", foo)
        app.add_app("Bar", "list", bar)
        app.run()
    It is also possible to keep each application in a separate file.
        import foo
        import bar
        app = MultiApp()
        app.add_app("Foo", "house", foo.app)
        app.add_app("Bar", "list", bar.app)
        app.run()
    """
    def __init__(self):
        self.apps = []

    def add_app(self, title, icon, func):
        """Adds a new application.
        Parameters
        ----------
        func:
            the python function to render this app.
        icon:
            name of the icon shown next to the title in the sidebar menu.
        title:
            title of the app. Appears in the dropdown in the sidebar.
        """
        self.apps.append({
            "title": title,
            "icon": icon,
            "function": func
        })

    def run(self):

        st.sidebar.write(format_func=lambda app: app['title'])
        image = Image.open('docStore/img/sdsn.png')
        st.sidebar.image(image, width=200)

        with st.sidebar:
            selected = option_menu(None, [page["title"] for page in self.apps],
                                   icons=[page["icon"] for page in self.apps],
                                   menu_icon="cast", default_index=0)
            st.markdown("---")

        for index, item in enumerate(self.apps):
            if item["title"] == selected:
                self.apps[index]["function"]()
                break

        choice = st.sidebar.radio(label='Select the Document',
                                  help='You can upload the document \
                                        or else you can try an example document',
                                  options=('Upload Document', 'Try Example'),
                                  horizontal=True)
        add_upload(choice)
appStore/sdg_analysis.py
ADDED
@@ -0,0 +1,179 @@
# set path
import glob, os, sys
sys.path.append('../utils')

# import needed libraries
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from st_aggrid import AgGrid
from st_aggrid.shared import ColumnsAutoSizeMode
from utils.sdg_classifier import sdg_classification
from utils.sdg_classifier import runSDGPreprocessingPipeline, load_sdgClassifier
from utils.keyword_extraction import textrank
import logging
logger = logging.getLogger(__name__)
from utils.checkconfig import getconfig


# Declare all the necessary variables
config = getconfig('paramconfig.cfg')
model_name = config.get('sdg', 'MODEL')
split_by = config.get('sdg', 'SPLIT_BY')
split_length = int(config.get('sdg', 'SPLIT_LENGTH'))
split_overlap = int(config.get('sdg', 'SPLIT_OVERLAP'))
remove_punc = bool(int(config.get('sdg', 'REMOVE_PUNC')))
split_respect_sentence_boundary = bool(int(config.get('sdg', 'RESPECT_SENTENCE_BOUNDARY')))
threshold = float(config.get('sdg', 'THRESHOLD'))
top_n = int(config.get('sdg', 'TOP_KEY'))


def app():

    #### APP INFO #####
    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> SDG Classification and Keyphrase Extraction </h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):

        st.write(
            """
            The *SDG Analysis* app is an easy-to-use interface built \
            in Streamlit for analyzing policy documents with respect to SDG \
            classification of the paragraphs/texts in the document and \
            extracting the keyphrases per SDG label - developed by GIZ Data \
            and the Sustainable Development Solution Network. \n
            """)
        st.write("""**Document Processing:** The uploaded/selected document is \
            automatically cleaned and split into paragraphs with a maximum \
            length of 120 words using a Haystack preprocessing pipeline. The \
            length of 120 is an empirical value which should reflect the length \
            of a "context" and should limit the paragraph length deviation. \
            However, since we want to respect sentence boundaries, the limit \
            can be breached, and hence this limit of 120 is tentative. \n
            """)
        st.write("""**SDG Classification:** The application assigns paragraphs \
            to 16 of the 17 United Nations Sustainable Development Goals (SDGs). \
            SDG 17 "Partnerships for the Goals" is excluded from the analysis due \
            to its broad nature which could potentially inflate the results. \
            Each paragraph is assigned to one SDG only. Again, the results are \
            displayed in a summary table including the number of the SDG, a \
            relevancy score highlighted through a green color shading, and the \
            respective text of the analyzed paragraph. Additionally, a pie \
            chart with a blue color shading is displayed which illustrates the \
            three most prominent SDGs in the document. The SDG classification \
            uses open-source training [data](https://zenodo.org/record/5550238#.Y25ICHbMJPY) \
            from [OSDG.ai](https://osdg.ai/), a global \
            partnership and growing community of researchers and institutions \
            interested in the classification of research according to the \
            Sustainable Development Goals. The summary table only displays \
            paragraphs with a calculated relevancy score above 85%. \n""")

        st.write("""**Keyphrase Extraction:** The application extracts 15 \
            keyphrases from the document for each SDG label and displays the \
            results in a summary table. The keyphrases are extracted using \
            [Textrank](https://github.com/summanlp/textrank), \
            an easy-to-use and computationally inexpensive \
            model leveraging a combination of TF-IDF and graph networks.
            """)
        st.write("")
        st.write("")
        st.markdown("Some runtime metrics tested with cpu: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
        col1, col2, col3, col4 = st.columns([2, 2, 4, 4])
        with col1:
            st.caption("Loading Time Classifier")
            # st.markdown('<div style="text-align: center;">12 sec</div>', unsafe_allow_html=True)
            st.write("12 sec")
        with col2:
            st.caption("OCR File processing")
            # st.markdown('<div style="text-align: center;">50 sec</div>', unsafe_allow_html=True)
            st.write("50 sec")
        with col3:
            st.caption("SDG Classification of 200 paragraphs (~ 35 pages)")
            # st.markdown('<div style="text-align: center;">120 sec</div>', unsafe_allow_html=True)
            st.write("120 sec")
        with col4:
            st.caption("Keyword extraction for 200 paragraphs (~ 35 pages)")
            # st.markdown('<div style="text-align: center;">3 sec</div>', unsafe_allow_html=True)
            st.write("3 sec")


    ### Main app code ###
    with st.container():
        if st.button("RUN SDG Analysis"):

            if 'filepath' in st.session_state:
                file_name = st.session_state['filename']
                file_path = st.session_state['filepath']
                classifier = load_sdgClassifier(classifier_name=model_name)
                st.session_state['sdg_classifier'] = classifier
                all_documents = runSDGPreprocessingPipeline(file_name=file_name,
                                    file_path=file_path, split_by=split_by,
                                    split_length=split_length,
                                    split_respect_sentence_boundary=split_respect_sentence_boundary,
                                    split_overlap=split_overlap, remove_punc=remove_punc)

                if len(all_documents['documents']) > 100:
                    warning_msg = ": This might take some time, please sit back and relax."
                else:
                    warning_msg = ""

                with st.spinner("Running SDG Classification{}".format(warning_msg)):

                    df, x = sdg_classification(haystack_doc=all_documents['documents'],
                                               threshold=threshold)
                    df = df.drop(['Relevancy'], axis=1)
                    sdg_labels = x.SDG.unique()
                    textrank_keyword_list = []
                    for label in sdg_labels:
                        sdgdata = " ".join(df[df.SDG == label].text.to_list())
                        textranklist_ = textrank(textdata=sdgdata, words=top_n)
                        if len(textranklist_) > 0:
                            textrank_keyword_list.append({'SDG': label, 'TextRank Keywords': ",".join(textranklist_)})
                    textrank_keywords_df = pd.DataFrame(textrank_keyword_list)


                    plt.rcParams['font.size'] = 25
                    colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
                    # plot
                    fig, ax = plt.subplots()
                    ax.pie(x['count'], colors=colors, radius=2, center=(4, 4),
                           wedgeprops={"linewidth": 1, "edgecolor": "white"},
                           textprops={'fontsize': 14},
                           frame=False, labels=list(x.SDG_Num),
                           labeldistance=1.2)
                    # fig.savefig('temp.png', bbox_inches='tight',dpi= 100)


                    st.markdown("#### Anything related to SDGs? ####")

                    c4, c5, c6 = st.columns([1, 2, 2])

                    with c5:
                        st.pyplot(fig)
                    with c6:
                        labeldf = x['SDG_name'].values.tolist()
                        labeldf = "<br>".join(labeldf)
                        st.markdown(labeldf, unsafe_allow_html=True)
                    st.write("")
                    st.markdown("###### What keywords are present under SDG classified text? ######")

                    AgGrid(textrank_keywords_df, reload_data=False,
                           update_mode="value_changed",
                           columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
                    st.write("")
                    st.markdown("###### Top few SDG Classified paragraph/text results ######")

                    AgGrid(df, reload_data=False, update_mode="value_changed",
                           columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
            else:
                st.info("🤔 No document found, please try to upload it at the sidebar!")
                logging.warning("Terminated as no document provided")
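sdg_classification and load_sdgClassifier live in utils/sdg_classifier.py, which the file list includes but this view does not show. The general pattern described in the About text (classify each paragraph, then keep only predictions that clear the configured relevancy threshold) can be sketched with the Hugging Face transformers pipeline. The model name and threshold below are placeholders for illustration; the Space reads the real values from the MODEL and THRESHOLD entries of paramconfig.cfg, and this sketch is not the repository's implementation.

```python
# Sketch of threshold-filtered paragraph classification (illustrative only;
# the Space reads MODEL and THRESHOLD from paramconfig.cfg).
import pandas as pd
from transformers import pipeline

model_name = "distilbert-base-uncased-finetuned-sst-2-english"  # placeholder classifier
threshold = 0.85                                                # placeholder, cf. THRESHOLD in config

classifier = pipeline("text-classification", model=model_name)

paragraphs = [
    "Access to clean drinking water will be expanded to all rural communities.",
    "The committee met twice in 2020.",
]

predictions = classifier(paragraphs, truncation=True)
df = pd.DataFrame({
    "text": paragraphs,
    "label": [p["label"] for p in predictions],
    "score": [p["score"] for p in predictions],
})
# Keep only paragraphs whose predicted label clears the relevancy threshold.
df = df[df["score"] > threshold].reset_index(drop=True)
print(df)
```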
docStore/img/giz_sdsn_small.jpg
ADDED
docStore/img/ndc.png
ADDED
docStore/img/paris.png
ADDED
docStore/img/sdsn.png
ADDED
docStore/ndcs/cca.txt
ADDED
@@ -0,0 +1,81 @@
{"climate_risks_droughts": {"category": "climate change adaptation","id": {0:"(I)NDC not submitted or not yet included in analysis",
1: "Droughts are not climate risks concerns",
2: "Droughts are among the five climate risks concerns"}},
"climate_risks_extreme_weather": {"category": "climate change adaptation", "id": {0:"(I)NDC not submitted or not yet included in analysis",
1: "Extreme Weathers are not climate risks concerns",
2: "Extreme Weathers are among the five climate risks concerns"}},
"climate_risks_floods": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
1: "Floods are not climate risks concerns",
2: "Floods are among the five climate risks concerns"}},
"climate_risks_temp_increase": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
1: "Temperature increase are not climate risks concerns",
2: "Temperature increase are among the five climate risks concerns"}},
"climate_risks_sea_level_rise": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
1: "Sea level rise is not a climate risks concerns",
2: "Sea level rise is among the five climate risks concerns"}},

"priority_sectors_agriculture": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
1: "Agricultural sector is not that important in the context of adaptation ambitions",
2: "In the context of adaptation ambitions Agricultural sector is very important for the country",
3: "Agriculture sector plays an importance for the country, and therefore in the adaptation ambitions Agriculture sector has special actions and aims"}},

"priority_sectors_ecosystems": {"category": "climate change adaptation","id": {0 :"(I)NDC not submitted or not yet included in analysis",
1 :"Biodiversity and preservation of Ecosystems is not that important in the context of adaptation ambitions",
2: "In the context of adaptation ambitions Biodiversity and preservation of Ecosystems is very important for the country",
3: "Biodiversity and Ecosystems plays an importance for the country, and therefore in the adaptation ambitions Biodiversity and Ecosystems has special actions and aims"}},
"priority_sectors_forestry": {"category": "climate change adaptation", "id": {0: "(I)NDC not submitted or not yet included in analysis",
1: "Forestry sector is not that important in the context of adaptation ambitions",
2: "In the context of adaptation ambitions Forestry sector is very important for the country",
3: "Forestry sector plays an importance for the country, and therefore in the adaptation ambitions Agriculture sector has special actions and aims"}},
"priority_sectors_health": {"category": "climate change adaptation","id": { 0: "(I)NDC not submitted or not yet included in analysis",
1: "Health sector is not that important in the context of adaptation ambitions",
2: "In the context of adaptation ambitions Health sector is very important for the country",
3: "Health sector plays an importance for the country, and therefore in the adaptation ambitions Agriculture sector has special actions and aims"}},

"priority_sectors_water": {"category": "climate change adaptation","id": { 0 : "(I)NDC not submitted or not yet included in analysis",
1: "Water sector is not that important in the context of adaptation ambitions",
2: "In the context of adaptation ambitions Water sector is very important for the country",
3: "Water sector plays an importance for the country, and therefore in the adaptation ambitions Agriculture sector has special actions and aims"}},

"vulnerability_agriculture": {"category": "climate change adaptation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
1: "Agriculture is a not a vulnerable sector",
2: "Agriculture is a vulnerable sector"}},
"vulnerability_coastal_zones": {"category": "climate change adaptation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
1: "Coastal Zone is a not a vulnerable sector",
2: "Coastal Zone is a vulnerable sector"}},
"vulnerability_ecosystems": {"category": "climate change adaptation", "id":{ 0: "(I)NDC not submitted or not yet included in analysis",
1: "Biodiversity and Ecosystems is a not a vulnerable sector",
2: "Biodiversity and Ecosystems is a vulnerable sector"}},
"vulnerability_health": {"category": "climate change adaptation","id": {0:"(I)NDC not submitted or not yet included in analysis",
1: "Health is a not a vulnerable sector",
2: "Health is a vulnerable sector"}},
"vulnerability_water": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
1: "Water is a not a vulnerable sector",
2: "Water is a vulnerable sector"}},

"costs_of_adaptation": {"category": "climate change adaptation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
1: "The partial cost of adaptation is tentatively around few million dollars",
2: " The cost of adaptation will be 0-1 billion US$ until 2030",
3: " The cost of adaptation will be 1-5 billion US$ until 2030",
4: " The cost of adaptation will be 5-10 billion US$ until 2030",
5: " The cost of adaptation will be 10-20 billion US$ until 2030",
6: "The cost of adaptation will be more than 20 billion US$ until 2030"}},
"costs_of_future_climate_related_hazards": {"category": "climate change adaptation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
1: "The future losses from climate change will be huge",
2: "The climate hazards cause significant loss to economy and life, and the cost of Future losses could go around few million dollars"}},

"costs_of_recent_climate_related_hazards": {"category": "climate change adaptation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
1: "No losses indicated",
2: "In the recent climate hazards there has been significant Economic losses.",
3: "In the recent climate hazards the impact on human life has been significant",
4: "In the recent climate hazards the impact on human life has been significant and the economic loss amounts to 5.3"}},
"quantified_adaptation_targets": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
1:"No quantitative adaptation target",
2: "In this sector we aim to achieve the adaptation targets of 5.6 in coming few years",
3: "In this sector we aim to achieve the adaptation targets of 5.6 in coming few years",
4: "In this sector we aim to achieve the adaptation targets of 5.6 in coming few years"}},

"slow_onset_others": {"category": "climate change adaptation","id": {0: "(I)NDC not submitted or not yet included in analysis",
1:"Apart from sea level rise and temperature increase, no other specific slow onset process",
2: "There are other slow onset processes additional to sea level rise and temperature increase like loss of biodiversity, desertification, glacier retreat, salinisation or ocean acidification"}},
}
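Both cca.txt and ccm.txt (below) store a Python-literal dictionary that maps each NDC Explorer indicator to a category and an id-to-sentence lookup, which appStore/coherence.py loads with ast.literal_eval. A short sketch of reading that structure the same way, assuming it is run from the repository root:

```python
# Read the indicator dictionary the same way appStore/coherence.py does.
import ast

with open('docStore/ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
    cca_sent = ast.literal_eval(dfile.read())

# Each top-level key is an NDC Explorer indicator.
indicator = cca_sent["climate_risks_droughts"]
print(indicator["category"])   # "climate change adaptation"
print(indicator["id"][2])      # sentence used when a country's value for this indicator is 2
```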
docStore/ndcs/ccm.txt
ADDED
@@ -0,0 +1,86 @@
{"agriculture": {"category": "climate change mitigation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
1: "Agriculture sector is not considered for climate change mitigation",
2: "Agriculture sector contribution in greenhouse gases emission is significant and therefore is part of climate change mitigation",
3: "Agriculture sector contribution in greenhouse gases emission is significant. Given the importance of agriculture sector for economy and and its adverse contribution in greenhouse gas emissions it is a Focus area for climate change mitigation and needs to be prioritised"}},

"energy_efficiency": {"category": "climate change mitigation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
1: "Energy Efficiency is not considered for climate change mitigation",
2: "Energy sector contribution in greenhouse gases emission is significant and therefore Energy Efficiency is part of climate change mitigation",
3: "Energy sector contribution in greenhouse gases emission is significant. Given the importance of the energy sector for economy and its adverse contribution to greenhouse gas emissions, energy efficiency is a Focus area for climate change mitigation and needs to be prioritised. The quantified renewable energy targets like for example in solar, geothermal, wind power are provided."}},

"fossil_fuel_production": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
1:"There is no recorded FFP (2016)",
2: "Fossil fuel Production is important for economy",
3:"Fossil fuel Production is important to provide for the basic requirements of the people in the country",
4:"The country's dependence on Fossil fuel production to meet energy and other requirements cannot be ignored, however the climate change impact due to the same cannot be ignored. The plans and actions as part of climate change mitigation includes measures to address production (e.g. subsidy removal; taxes); cleaner production",
5: "Fossil fuel Production is important to provide for the basic requirements of the people in the country.The country's dependence on Fossil fuel production to meet energy and other requirements cannot be ignored, however the climate change impact due to same cannot be ignored. The plans and actions as part of climate change mitigation includes measures to address production (e.g. subsidy removal; taxes); cleaner production"}},
"fossil_fuel_subsidiaries": {"category": "climate change mitigation","id":{0: "(I)NDC not submitted or not yet included in analysis",
1:"fossil Fuel subsidiaries are not considered",
2:"the alternates/subsidiaries to fossil Fuel need to be considered to meet the mitigations ambitions",
3:"The fossil fuel contribution towards greenhouse gas emissions is very high and therefore there is a need to find the alternatives/substitutes for the same. The replacement of fossil fuels with alternates is a priority focus area in the mitigation actions to meet mitigation ambitions."}},

"land_use_and_forestry": {"category": "climate change mitigation", "id":{0:"(I)NDC not submitted or not yet included in analysis",
1:"land use and forestry are not considered",
2:"the land use and forestry contribute to greenhouse gas emissions and need to be considered to meet the mitigations ambitions",
3:"The land use and forestry contribution towards greenhouse gas emissions is significant and therefore there is need to quantify the mitigation potential land use and forestry."}},
"land_use_change": {"category": "climate change mitigation", "id": {0:"(I)NDC not submitted or not yet included in analysis",
1: "land use change Not mentioned",
2: "land use change is being considered, but there are no mitigation targets",
3: "land use change is being considered as part of mitigation targets",
4: "land use change can play an important role in mitigation efforts. As part of mitigation plan there are quantified targets for land use change.",
5: "land use change can play an important role in mitigation efforts. As part of mitigation plan there are quantified targets for land use change."}},

"renewable_energy": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
1:"renewable energy is not considered",
2:"Renewable energy are direct measure to reduce the greenhouse gas emissions and need to be considered to meet the mitigations ambitions",
3:"Renewable energy are direct measure to reduce the greenhouse gas emissions and therefore there is need to quantify the mitigation potential in terms of renewable energy targets and specific sub-sectors of action (e.g. solar, geothermal, wind power)"}},

"temp_target": {"category": "climate change mitigation", "id": { 0: "(I)NDC not submitted or not yet included in analysis",
1:"Not mentioning global effort to limit global temperature increase to 2 degree celsius or 1.5 degree C",
2:"there is urgent need to limit global temperature increase to 2 degree celsius",
3:"there is urgent need to limit global temperature increase to 1.5 degree C",
4:"there is urgent need to limit global temperature increase to 2 degree celsius",
5:"there is urgent need to limit global temperature increase to 1.5 degree C"}},
"waste": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
1:"Waste as a topic is not mentioned",
2:"Waste reduction or management can play important role in mitigation plan and ambitions",
3:"Waste reduction or management can play an important role in sustainable development and hence is a focus area in mitigation plan and ambitions"}},
"transport": {"category": "climate change mitigation","id": {0:"(I)NDC not submitted or not yet included in analysis",
1:"Transport is not considered",
2:"Transport contribute to greenhouse gas emissions and need to be considered to meet the mitigations ambitions",
3:"transport sector contribution towards greenhouse gas emissions is significant and therefore there is need to focus/prioritise the transport sector to meet the mitigation potential"}},

"reducing_non_co2_gases": {"category": "climate change mitigation","id": {0:"(I)NDC not submitted or not yet included in analysis",
1:"Reduction of non CO2 gases not indicated",
2:"Efforts should be made in reduction of NOn CO2 gases too."}},


"base_year": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
1: "No base year",
2: "the base year or reference point for measurement of emissions is year 19XX"}},

"carbon_capture_and_storage": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
1: "carbon capture and storage not indicated",
2:"With Technology advancement the mitigation efforts can also in form of carbon capture and storage.",
3: "With technological advancement the mitigation efforts can also be in form of carbon capture and storage. This should be a focus area and more options need to be explored to do carbon capture and storage."}},

"costs_of_ccm": {"category": "climate change mitigation","id":{ 0: "(I)NDC not submitted or not yet included in analysis",
1: "(partial) costs not indicated",
2: " the mitigation actions and efforts will cost 0-1 billion US$ until 2030",
3:"the mitigation actions and efforts will cost 1-5 billion US$ until 2030",
4:"the mitigation actions and efforts will cost5-10 billion US$ until 2030",
5: "the mitigation actions and efforts will cost 10-20 billion US$ until 2030",
6:"the mitigation actions and efforts will cost will be more than 20 billion US$ until 2030"}},

"market_mechanisms": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
1: "International market mechanisms not mentioned",
2:"One good mechanism to deal with greenhouse gas emissions is to explore International market mechanisms",
3: "International market mechanisms are not a good way of dealing with mitigation ambitions and therefore should not be considered. Greenhouse gas emissions cannot be part of tradable commodity.",
4: "Carbon emissions of greenhouse gas emissions are now a tradable commodity and these can provide a good source for funds and achieving mitigation ambitions. Therefore it is important to exploreInternational market mechanisms. It is important that such means should be explored and there will be plan of actions soon to include these in meeting mitigations target",
5: "Carbon emissions of greenhouse gas emissions are now a tradable commodity and these can provide a good source for funds and achieving mitigation ambitions. Therefore it is important to exploreInternational market mechanisms. It is important that such means should be explored and there will be plan of actions soon to include these in meeting mitigations target"}},

"redd": {"category": "climate change mitigation","id":{ 0:"(I)NDC not submitted or not yet included in analysis",
1: "REDD+ not mentioned",
2: "Reducing Emissions of Deforestation and Forest Degradation/REDD+",
3: "Reducing Emissions of Deforestation and Forest Degradation/REDD+"}},
}
docStore/ndcs/countryList.txt
ADDED
@@ -0,0 +1,170 @@
{'Afghanistan': 'AFG', 'Albania': 'ALB', 'Algeria': 'DZA', 'Andorra': 'AND', 'Angola': 'AGO',
'Antigua and Barbuda': 'ATG', 'Argentina': 'ARG', 'Armenia': 'ARM', 'Australia': 'AUS', 'Azerbaijan': 'AZE',
'Bahamas': 'BHS', 'Bahrain': 'BHR', 'Bangladesh': 'BGD', 'Barbados': 'BRB', 'Belarus': 'BLR',
'Belize': 'BLZ', 'Benin': 'BEN', 'Bhutan': 'BTN', 'Bolivia': 'BOL', 'Bosnia and Herzegovina': 'BIH',
'Botswana': 'BWA', 'Brazil ': 'BRA', 'Brunei Darussalam': 'BRN', 'Burkina Faso': 'BFA', 'Burundi ': 'BDI',
'Cabo Verde': 'CPV', 'Cambodia': 'KHM', 'Cameroon': 'CMR', 'Canada': 'CAN', 'Central African Republic': 'CAF',
'Chad': 'TCD', 'Chile': 'CHL', 'China': 'CHN', 'Colombia': 'COL', 'Comoros': 'COM',
'Congo': 'COG', 'Cook Islands': 'COK', 'Costa Rica': 'CRI', 'Cote dIvoire': 'CIV', 'Cuba': 'CUB',
"Democratic People's Republic of Korea": 'PRK', 'Democratic Republic of Congo': 'COD', 'Djibouti': 'DJI', 'Dominica': 'DMA', 'Dominican Republic': 'DOM',
'Ecuador': 'ECU', 'Egypt': 'EGY', 'El Salvador': 'SLV', 'Equatorial Guinea': 'GNQ', 'Eritrea': 'ERI',
'Ethiopia': 'ETH', 'European Union': 'EU', 'Fiji': 'FJI', 'Gabon': 'GAB', 'Gambia': 'GMB',
'Georgia': 'GEO', 'Ghana': 'GHA', 'Grenada': 'GRD', 'Guatemala': 'GTM', 'Guinea': 'GIN',
'Guinea Bissau': 'GNB', 'Guyana': 'GUY', 'Haiti': 'HTI', 'Honduras': 'HND', 'Iceland': 'ISL',
'India': 'IND', 'Indonesia': 'IDN', 'Iran': 'IRN', 'Iraq': 'IRQ', 'Israel': 'ISR',
'Jamaica': 'JAM', 'Japan': 'JPN', 'Jordan': 'JOR', 'Kazakhstan': 'KAZ', 'Kenya': 'KEN',
'Kingdom of Eswatini': 'SWZ', 'Kiribati': 'KIR', 'Kuwait': 'KWT', 'Kyrgyzstan': 'KGZ', 'Lao Peoples Democratic Republic': 'LAO',
'Lebanon': 'LBN', 'Lesotho': 'LSO', 'Liberia': 'LBR', 'Libya': 'LBY', 'Liechtenstein': 'LIE',
'Madagascar': 'MDG', 'Malawi': 'MWI', 'Malaysia': 'MYS', 'Maldives': 'MDV', 'Mali': 'MLI',
'Marshall Islands': 'MHL', 'Mauritania': 'MRT', 'Mauritius': 'MUS', 'Mexico': 'MEX', 'Micronesia': 'FSM',
'Monaco': 'MCO', 'Mongolia': 'MNG', 'Montenegro': 'MNE', 'Morocco': 'MAR', 'Mozambique': 'MOZ',
'Myanmar': 'MMR', 'Namibia': 'NAM', 'Nauru': 'NRU', 'Nepal': 'NPL', 'New Zealand': 'NZL',
'Nicaragua': 'NIC', 'Niger': 'NER', 'Nigeria': 'NGA', 'Niue': 'NIU', 'Norway': 'NOR',
'Oman': 'OMN', 'Pakistan': 'PAK', 'Palau ': 'PLW', 'Palestine': 'PSE', 'Panama': 'PAN',
'Papua New Guinea': 'PNG', 'Paraguay': 'PRY', 'Peru': 'PER', 'Philippines': 'PHL', 'Qatar': 'QAT',
'Republic of Moldova': 'MDA', 'Republic of North Macedonia': 'MKD', 'Russian Federation': 'RUS', 'Rwanda': 'RWA', 'Saint Kitts and Nevis': 'KNA',
'Saint Lucia': 'LCA', 'Saint Vincent and the Grenadines': 'VCT', 'Samoa': 'WSM', 'San Marino': 'SMR', 'Sao Tome and Principe': 'STP',
'Saudi Arabia': 'SAU', 'Senegal': 'SEN', 'Serbia': 'SRB', 'Seychelles': 'SYC', 'Sierra Leone': 'SLE',
'Singapore': 'SGP', 'Solomon Islands': 'SLB', 'Somalia': 'SOM', 'South Africa': 'ZAF', 'South Korea': 'KOR',
'South Sudan': 'SSD', 'Sri Lanka': 'LKA', 'Sudan': 'SDN', 'Suriname': 'SUR', 'Switzerland': 'CHE',
'Syria': 'SYR', 'Tajikistan': 'TJK', 'Thailand': 'THA', 'Timor Leste': 'TLS', 'Togo': 'TGO',
'Tonga': 'TON', 'Trinidad and Tobago': 'TTO', 'Tunisia': 'TUN', 'Turkey': 'TUR', 'Turkmenistan': 'TKM',
'Tuvalu': 'TUV', 'Uganda': 'UGA', 'Ukraine': 'UKR', 'United Arab Emirates': 'ARE', 'United Kingdom': 'GBR',
'United Republic of Tanzania': 'TZA', 'United States of America': 'USA', 'Uruguay': 'URY', 'Uzbekistan': 'UZB', 'Vanuatu': 'VUT',
'Venezuela ': 'VEN', 'Vietnam': 'VNM', 'Yemen': 'YEM', 'Zambia': 'ZMB', 'Zimbabwe': 'ZWE'}
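Both docStore/ndcs/countryList.txt and the cca.txt/ccm.txt indicator files above are stored as Python dict literals in plain text. Below is a minimal sketch of how such files could be loaded and queried; the use of ast.literal_eval and the example lookup keys are illustrative assumptions, not necessarily how utils/ndc_explorer.py reads them.

import ast

# Parse a dict literal stored as plain text (assumed layout of the docStore files).
def load_dict_txt(path: str) -> dict:
    with open(path, "r", encoding="utf-8") as f:
        return ast.literal_eval(f.read())

countries = load_dict_txt("docStore/ndcs/countryList.txt")   # country name -> ISO3 code
ccm = load_dict_txt("docStore/ndcs/ccm.txt")                 # indicator -> {"category", "id" -> text}

print(countries["Ethiopia"])         # 'ETH'
print(ccm["base_year"]["id"][1])     # 'No base year'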
docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt
ADDED
@@ -0,0 +1,737 @@
Ethiopia 2030: The Pathway to Prosperity
Ten Years Perspective Development Plan (2021 - 2030)

Content
1. Baselines and Assumptions
2. Strategic pillars
3. Departures
4. Macroeconomic goals
5. Implications of the COVID-19 pandemic and necessary mitigation measures
6. Potentials/capabilities
7. Focus areas
7.1. Productive sectors
7.2. Services sector
7.3. Enabling sectors
8. Balanced and competitive development (nationally, regionally and locally)
9. Monitoring and Evaluation

1. Baselines and Assumptions

Key performances of previous years:
[Chart: Poverty reduction - proportion of people living below the poverty line (%), 1994-2020: 45.5, 44.2, 38.7, 29.6, 23.5, 19]
[Chart: GDP growth rate (%), GTP I (2011-2015) and GTP II (2015/16-2019/20, projection with COVID-19): 10.5, 8.8, 10.1, 7.7, 9, 5.19-6.20]
[Chart: Share of economic sectors in GDP (%), 2010/11-2018/19 - Agriculture falling from 46.9 to 32.8, Industry rising from 13.4 to 27.8, Manufacturing from 4.7 to 6.8, Construction from 7.1 to 20.1, Services around 38-40; Merchandise export as % of GDP falling from 8.66 to 2.77]
[Chart: Labour force participation (2013) - national: Agriculture 73%, Industry 7%, Services 20%; urban: Agriculture 7%, Industry 22%, Services 71%]

High and increasing unemployment rate:
• Urban unemployment rate = 19.1% in 2018
• Youth unemployment rate = 25.3% (male 18.6%, female 30.9%)
• Rural unemployment rate = 2% in 2013
• Declining per capita rural land creating disguised unemployment
[Chart: Number of unemployed people in urban areas by sex and age group (15-19, 20-24, 25-29), 2014: 402,869 and 2018: 471,535]

Challenges:
1. Macroeconomic imbalances: sustained high inflation; high and rising unemployment, especially in urban areas; high and rising debt burden; chronic foreign currency shortage; sluggish (though encouraging) rate of structural change
2. Vulnerability to shocks (COVID-19, climate change, desert locust infestation, etc.)
3. Poor quality and high inequity in infrastructure projects
4. Poor quality services in health and education (high repetition and dropout rates from school)

Major areas of failure in the economy:
• Poor quality of growth and slow structural change
• Excessive aid and loan dependence for financing infrastructural and construction investments
• Limited success in expanding manufacturing and modern agriculture, which have high job creation potential
• Weak institutional capacity as the main culprit of all failures: provision of quality services (electricity, water, telephone, internet); creation of enough jobs and improved living standards; generation of reliable foreign exchange revenue and debt-sustainable national economic capacity; completion of development projects and investment plans under public-private partnerships
• Low reward for merit, productivity and effort, while low disincentive for laziness, wastefulness and corruption
• Slow institutional change and transformation in government policies, investor attitude, youth behaviour and the role of the intellectuals
• The need for a sustained increase in production and productivity
• The need to set a common national vision to achieve major successes with consensus and popular legitimacy

2. Departures
1. Emphasis on quality of economic growth
2. Participation and coordination of sectors in the planning process
3. Sectoral linkages and multi-sectoral development focus
4. Preparation of national development corridors based on development potentials
5. Focus on solving institutional bottlenecks
6. The ongoing home grown economic reform programme as a springboard
7. Emphasis on resilience building, innovation and entrepreneurship

3. Strategic pillars
1. Ensure quality growth
2. Improve productivity and competitiveness
3. Undertake institutional transformation
4. Ensure private sector's leadership in the economy
5. Ensure equitable participation of women and children
6. Build climate resilient green economy

Ensuring quality economic growth and raising production and productivity:
• Increasing export revenues and substituting imports by reducing production costs
• Availing quality and massive infrastructure, linking infrastructural development with development corridors
• Producing enough and quality human resources
• Prioritizing innovative production systems
• Linking incentives with export revenue and job creation performances
• Modernizing and enhancing the logistic system
• Creating technological competences needed for long-term growth
• The economic growth should ensure participation of all citizens and equitable utilization of the growth proceeds, improved standard of living of every citizen, reduced poverty in all indicators, and reduced inflation and unemployment
• The economic growth should lead to increased aggregate supply
• Focus on modern agriculture, manufacturing and mining
• Emphasis on exploiting the sources of growth through structural change

Institutional transformation and private sector's leadership in the economy:
• Build democratic and judicial institutions that ensure elite bargain, national consensus, common vision and government legitimacy
• Build a private sector and competition friendly bureaucracy
• Coordinate with parents, society and teachers to make educational institutions centres of excellence that produce virtuous citizens
• Coordinate with parents as well as social and religious leaders so that religious institutions and their teachings contribute towards poverty reduction efforts
• Prepare policies, strategies and legal frameworks for achieving prosperity
• Increased focus on innovation and research
• Creating a strong social security system
• Create a conducive investment climate and incentivize domestic investors in key sectors
• Build strong and market-led public-private partnerships in order to ensure the establishment of an inclusive and pragmatic market economy
• Enhance access to and quality of infrastructure to attract quality foreign direct investment
• Identify new sources of growth, empower and stimulate the private sector, and supplement the private sector in strategic areas
• Emphasis on public-private partnership for problem solving innovations and research activities

Equitable participation of women and children, and climate resilient green economy:
• Ensure gender equity in economic and social sectors: participation of women at all levels of education; asset ownership of women
• Ensure fair participation of women and youth in leadership and decision making positions
• Create awareness among citizens about the role of women and youth in the country's overall development
• Increase basin development efforts to fight land degradation and reduce pollution
• Improve productivity and reduce GHG emissions
• Increase forest protection and development
• Increase production of electricity from renewable sources for domestic use and for export
• Focus on modern and energy saving technologies

4. Macroeconomic Goals
Assumptions:
- Requirement to significantly reduce poverty
- Available national potentials
- Potential for investment in the economy
- Existing potentials in each sector
- Low productivity that needs to be improved
Goals:
• Make Ethiopia a middle income economy by 2022
• Raise per capita income to USD 1,115 in 2022 (the threshold for middle income is USD 1,026, plus the human development index and economic vulnerability index)
• Raise per capita income to USD 2,220 by 2030
• Sectoral growth targets (2021-2030): 10.2% average growth target, assured middle-income potential, and a reduced percentage of population below the poverty line

Structural change and financing gaps:
• Reduce urban unemployment to less than 9%; 1.36 million new jobs need to be created per annum
• Sectoral composition of GDP and labour force participation (performance 2011, 2015, 2018/19; target 2030):
  Agriculture: 45, 39.7, 32.8; 22.0
  Industry: 15.1, 21.2, 27.6; 35.9
  Manufacturing: 4.7, 5.5, 6.8; 17.2
  Services: 39.9, 39, 39.4; 42.1

5. Implications of the COVID-19 pandemic and necessary mitigation measures
• GDP growth for the 2019/20 fiscal year is projected to be lower than its target of 9.0% by between 2.81 and 3.80 percentage points (equivalent to 58.3 - 78.8 billion birr) due to the COVID-19 pandemic
• If the current scenario continues, next year's GDP growth could decline by 2.8 percentage points
• Returning the economy to its high growth trajectory requires focusing on sectors with high productivity and job creation potential
• Public investment should focus on empowering the private sector
• Promoting both domestic and foreign investment with the right set of incentives (merit based)
• Modernizing production systems and improving uptake of technology
• Conducting demand analysis for export commodities to remedy the declining trend in exports and foreign exchange earnings

6. Potentials
Natural resources:
• Endowment of various natural resources contributing to the growth potential
• Huge unutilized arable land creates great potential for the success of the plan
• Endowment of gemstones, ornamental stones, energy, metals and metallic minerals
• Gold, coal, iron ore, potash, tantalum, marble, petroleum and other natural resources
Human capital:
• Large youth population and potential for a demographic dividend
• Cumulative capacity in education and health
• Positive attitude and noble culture of reaching agreement among citizens
Physical capital:
• Built physical and material capital: transport and communication; irrigation infrastructure for modern agriculture; industrial parks; mega energy infrastructure
Unexploited growth potentials:
• Utilizing the tourism potential through modernization
• Using the mining subsector as a source of inputs as well as a competitive industry in its own right
Increased demand as potential:
• Solving supply side bottlenecks to satisfy the existing demand
Political capital:
• Improving international acceptance and reliable partnerships: the "medemer"/synergy philosophy; the ongoing political reform measures; the Homegrown Economic Reform programme
• Increased finance from partners and multilateral institutions: increased availability of foreign exchange; reduced debt stress for the short to medium term; increased potential for development
Continental and regional integrations:
• Regional and continental economic integration agreements
• International and continental free trade agreements
Low technology as a potential:
• Undeniably low status of technological development
• International mobility and spillover effect of technology
• Potential for development and catching up by filling the technological gaps
• Doubling crop productivity from the current 24-36 quintals per hectare will result in a 7% increase in crop production
• Raise the production efficiency of manufacturing from the current 50% to 80%

7. Focus Areas
7.1. Productive sectors: agriculture, manufacturing, mining
7.2. Service sector: tourism
7.3. Enabling sectors: energy, transport, sustainable finance, innovation and technology, urban development, irrigation, human capital development

7.1. Productive sectors - Agriculture (objectives and focus areas)
1. Free agriculture from rain dependence
2. Agricultural mechanization services
3. Contract farming, cluster approach and land consolidation
4. Livestock, animal feed and animal health
5. Horticulture (irrigation and urban farming)
6. Private sector participation
7. Institutional implementation capacity
8. Climate resilient sustainable agricultural development
1. Improve income and livelihood options for farming and pastoral communities through increased productivity and competitiveness
2. Modernize agriculture and ensure national food and nutrition security
3. Raise export of agricultural output and substitute imports
4. Make agriculture a viable and profitable enterprise through value addition
5. Create rural employment opportunities
6. Enhance livestock health access and quality
7. Preserve animal genetic resources and increase pastoral research
8. Improve the development of animal feed and access to markets
9. Develop a livestock specific extension package for each livestock type

7.1. Productive sectors - Manufacturing industry (objectives and focus areas)
1. Production of quality and competitive food, textile, housing and pharmaceutical products for export and domestic markets
2. Production and productivity of existing manufacturing industries
3. Utilization of locally available inputs
4. Value chains, linkages and interdependencies
5. Linkages between large scale metallurgical and engineering, chemical and pharmaceutical industries with other industries
6. Job creation, cluster approaches and expanding small and medium scale manufacturing
7. Private sector participation and partnership
1. Establish the basis for domestic industrialization
2. Value addition through enhanced inter-sectoral linkages
3. Enhance productivity through private sector leadership and the supportive role of the government
- Create job opportunities for the youth leaving agriculture and concentrating in urban areas
- Make exportable commodities internationally competitive
- Ensure structural change

7.1. Productive sectors - Mining (objectives and focus areas)
• Foreign exchange earnings and domestic revenues
• Increased investment in mining
• Participation of manufacturing industries that add value
• Job creation
• Add value for improved contribution of the subsector
• Increase inter-sectoral linkages to raise raw material inputs to other sectors
• Make mining a competent subsector and induce structural change
• Increase human resource and technological capabilities through research and training
• Raise foreign exchange revenue from mining through increased exploration and production
• Improve traditional mining production and marketing systems
• Improve the country's geological information

7.2. Service sector - Tourism (objectives and focus areas)
• Identification and development of destinations
• Infrastructure
• Competitiveness: improve existing destinations, develop new destinations, diversify services and raise quality
• Market linkages, branding and promotion
• Technology, research and development
• Preservation, maintenance and proper utilization of heritage resources
• Expand job opportunities
• Raise incomes
• Build information management systems
• Increase implementation capacity

7.3. Enabling sectors - Urban development (objectives and focus areas)
• Prioritize productive sectors in job creation and enterprise development plans
• Rapid development and equity goals in the land provision system
• Participation of indigenous people in land redevelopment and expansion
• Urban land registration and cadastre system, modern property valuation
• Greenery and public spaces as well as waste disposal and management in urban planning and implementation
• Housing development and financing options to reduce housing shortages
• Integrated infrastructure and services provision
• Role of the private sector in infrastructure development and service provision
• Expand micro and small-scale enterprises to reduce urban unemployment
• Develop and avail urban land based on demand, equity and cost effectiveness
• Make quality housing accessible both in rural and urban areas
• Develop quality and integrated infrastructure as well as service provision in towns
• Improve financial management and resource utilization in urban areas

7.3. Enabling sectors - Innovation and technology (objectives and focus areas)
• Access to innovation and technological information
• Developing a digital economy
• Productivity enhancement and competitiveness
• Build a digital economy
• Develop national scientific research and technological capabilities
• Support problem solving research and the development of technologies necessary for raising production, productivity and service provision
• Create jobs and capital that are based on technology
• Develop technological and data security protection systems

7.3. Enabling sectors - Sustainable finance (objectives and focus areas)
• Access to modern finance and a saving culture in rural areas
• Support to the private sector and corporations to reinvest profits in productive sectors
• Role of private financial institutions in manufacturing and agriculture
• Digital revenue collection system
• Tax equity (contraband, tax evasion, and bringing the underground economy into the tax system)
• Domestic and foreign strategic partnerships
• Transform financing from short term to long-term, sustainable and quality sources
• Ensure financing quality based on sectoral prioritization and reduction of wastage
• Increase the number of domestic saving institutions both in rural and urban areas
• Support domestic finance with foreign exchange capacity and foreign direct investment
• Modernize the domestic revenue collection system
• Raise voluntary tax payment attitudes
• Bring the informal sector into the formal tax system

7.3. Enabling sectors - Transport (objectives and focus areas)
• Access to infrastructure
• Implementation capacity
• Participation of the private sector and the general public
• Financing capacity
• Ensure equitable access to transport infrastructure and services
• Improve transport safety
• Make logistics services fast and reliable
• Build transport infrastructure and services that are resilient to climate change

7.3. Enabling sectors - Energy (objectives and focus areas)
• Equity in access to electricity services
• Energy access and quality
• Alternative sources of energy
• Reliability of electricity infrastructure
• Investment and income in the energy subsector

7.3. Enabling sectors - Irrigation (objectives and focus areas)
• Medium and large scale irrigation infrastructure
• Job creation
• Share of government expenditure and alternative financing options
• Institutional capacity and human resource development
• Improve agricultural output and productivity
• Reduce government spending and enhance institutional capacity and human resources development
• Ensure the inclusion of all genders and disabled citizens
• Develop alternative financing options for irrigation development

7.3. Enabling sectors - Human capital development (objectives and focus areas)
• Make education and training inclusive and equitable by harmonizing the system with ability, need and capacity
• Develop the capacity of educational institutions (teacher capacity, inputs and technology)
• Establish an education and training quality assurance system
• Avail free and compulsory education from pre-primary to junior secondary levels and free education at the senior secondary levels, equitably
• Ensure the relevance of the education and training system and synchronize education policy with economic and social development needs
• Make the education and training policy compatible with the nation's contemporary capacities as well as global and regional market opportunities
• Enhance commitment, capability and responsibility of citizens
• Ensure equitable and quality health services
• Raise average life expectancy
• Achieve universal health coverage through a proactive and preventive health system
• Curtail preventable maternal and child deaths
• Reduce incidences of contagious and non-contagious diseases and deaths
• Build capacity for health tourism through increased treatment capabilities
• Create a healthy society that is free from addictions and uses technology to support knowledge led economic development

8. Nationally, regionally and locally balanced and competitive development
Challenges:
1. Lack of synchronization of investment with resource potentials and development needs
2. Poor alignment of federal, regional and district level investment plans with the national development goals and envisioned settlement patterns
3. Poor regional coordination due to low consideration for trans-regional and spatial issues in the development plans of regional states
4. Inter-regional and intra-regional disparities in infrastructural development and access to services
Focus areas:
1. Ensure that investment flows and infrastructural development plans go hand in hand with resource potential and development needs: developing underutilized natural resources; equitable distribution of and access to infrastructure; sustainable environmental protection
2. Ensure the inclusion of pastoral and agro-pastoral areas in the development: focused infrastructural development in pastoral areas such as education and health sector input provision as well as governance; market linkages with other areas and the central markets; improved rural finance (credit and insurance) to encourage fattening, milk processing, leather production and irrigation agriculture

9. Monitoring and Evaluation
• 10 Years Perspective Plan KPIs
• Federal implementing institutions
• Planning and Development Commission
• Generate data (census, sample and administrative data)
• Annual reports
• Dialogue forums (civic organizations, professional associations, development partners, intellectuals)
• Central Statistical Agency
• Database, National Information Portal, National Statistics Development Strategic Plan
• Evaluation reports
• Prime Minister's Office
• House of People's Representatives

Thank you!
docStore/sample/South Africa_s Low Emission Development Strategy.txt
ADDED
The diff for this file is too large to render.
See raw diff
docStore/sample/files.json
ADDED
@@ -0,0 +1,3 @@
{"Ethiopia: 10 Year Development Plan":"docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt",
"South Africa:Low Emission strategy":"docStore/sample/South Africa_s Low Emission Development Strategy.txt"
}
docStore/sample/keywordexample.json
ADDED
@@ -0,0 +1,8 @@
{
"Climate":"Climate,Adaptation,Mitigation,Decarbonization,Carbon neutrality,Net zero Emissions",
"Food":"Food security,Nutrition,Diets,Food loss",
"Implementation":"Implementation,transformation,reform,integration,strategy,policy",
"Nature":"Nature,Nature-based solutions,Biodiversity,Degradation",
"Social":"Indigenous,Local community(ies),Rural livelihoods,Minority",
"Gender":"gender, women empowerment, women economic power, gender bias"
}
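files.json maps a display name to a sample document path, and keywordexample.json maps a topic to a comma-separated string of suggested search keywords. A minimal loading sketch follows, assuming standard json parsing; the variable names and the split on ',' are illustrative and not taken from the appStore code.

import json

with open("docStore/sample/files.json") as f:
    sample_files = json.load(f)          # display name -> sample document path

with open("docStore/sample/keywordexample.json") as f:
    keyword_examples = json.load(f)      # topic -> comma-separated keywords

climate_keywords = [k.strip() for k in keyword_examples["Climate"].split(",")]
# ['Climate', 'Adaptation', 'Mitigation', 'Decarbonization', 'Carbon neutrality', 'Net zero Emissions']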
packages.txt
ADDED
@@ -0,0 +1,4 @@
poppler-utils
xpdf
tesseract-ocr
libtesseract-dev
paramconfig.cfg
ADDED
@@ -0,0 +1,47 @@
[lexical_search]
TOP_K = 20
SPLIT_BY = word
SPLIT_LENGTH = 120
SPLIT_OVERLAP = 0
REMOVE_PUNC = 0

[semantic_search]
RETRIEVER_TOP_K = 10
MAX_SEQ_LENGTH = 384
RETRIEVER = all-mpnet-base-v2
RETRIEVER_FORMAT = sentence_transformers
EMBEDDING_DIM = 768
RETRIEVER_EMB_LAYER = -1
READER = deepset/tinyroberta-squad2
READER_TOP_K = 10
READER_TOP_K_PER_CANDIDATE = 1
SPLIT_BY = word
SPLIT_LENGTH = 120
SPLIT_OVERLAP = 10
RESPECT_SENTENCE_BOUNDARY = 1
REMOVE_PUNC = 0

[sdg]
THRESHOLD = 0.85
MODEL = jonas/bert-base-uncased-finetuned-sdg
SPLIT_BY = word
REMOVE_PUNC = 0
SPLIT_LENGTH = 120
SPLIT_OVERLAP = 10
RESPECT_SENTENCE_BOUNDARY = 1
TOP_KEY = 15

[coherence]
RETRIEVER_TOP_K = 10
MAX_SEQ_LENGTH = 512
RETRIEVER = msmarco-distilbert-dot-v5
RETRIEVER_FORMAT = sentence_transformers
RETRIEVER_EMB_LAYER = -1
EMBEDDING_DIM = 768
THRESHOLD = 0.55
SPLIT_BY = word
SPLIT_LENGTH = 120
SPLIT_OVERLAP = 10
RESPECT_SENTENCE_BOUNDARY = 1
REMOVE_PUNC = 0
requirements.txt
ADDED
@@ -0,0 +1,19 @@
farm-haystack == 1.10
farm-haystack[ocr]==1.10.0
spacy==3.2.0
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
matplotlib==3.5.1
nltk==3.7
numpy==1.22.1
pandas==1.4.0
pdfplumber==0.6.2
Pillow==9.1.1
seaborn==0.11.2
transformers==4.21.2
st-annotated-text==3.0.0
markdown==3.4.1
summa==1.2.0
altair==4.0
streamlit-aggrid
python-docx
streamlit_option_menu
style.css
ADDED
@@ -0,0 +1,180 @@
.row-widget.stTextInput > div:first-of-type {
  background: #fff;
  display: flex;
  border: 1px solid #dfe1e5;
  box-shadow: none;
  border-radius: 24px;
  height: 50px;
  width: auto;
  margin: 10px auto 30px;
}

.row-widget.stTextInput > div:first-of-type:hover,
.row-widget.stTextInput > div:first-of-type:focus {
  box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2);
}

.row-widget.stTextInput .st-bq { background-color: #fff; }

.row-widget.stTextInput > label { color: #b3b3b3; }

.row-widget.stButton > button {
  border-radius: 24px;
  background-color: #B6C9B1;
  color: #fff;
  border: none;
  padding: 6px 20px;
  float: right;
  background-image: none;
}

.row-widget.stButton > button:hover { box-shadow: 1px 1px 2px 1px rgba(0, 0, 0, 0.2); }

.row-widget.stButton > button:focus { border: none; color: #fff; }

.footer-custom {
  position: fixed;
  bottom: 0;
  width: 100%;
  color: var(--text-color);
  max-width: 698px;
  font-size: 14px;
  height: 50px;
  padding: 10px 0;
  z-index: 50;
}

.main { padding: 20px; }

footer { display: none !important; }

.footer-custom a { color: var(--text-color); }

#wikipedia-assistant { font-size: 36px; }

.generated-answer p { font-size: 16px; font-weight: bold; }

.react-json-view { margin: 40px 0 80px; }

.tooltip {
  text-align: center;
  line-height: 20px;
  display: table-caption;
  font-size: 10px;
  border-radius: 50%;
  height: 20px;
  width: 20px;
  position: relative;
  cursor: pointer;
  color: #000;
}

.tooltip .tooltiptext {
  visibility: hidden;
  width: 280px;
  text-align: center;
  border-radius: 6px;
  padding: 10px;
  position: absolute;
  z-index: 1;
  top: 25px;
  left: 50%;
  margin-left: -140px;
  font-size: 14px;
  background-color: #fff;
  border: 1px solid #ccc;
  box-shadow: 0px 0px 3px 1px rgba(0, 0, 0, 0.16);
  color: #000;
}

.tooltip:hover .tooltiptext { visibility: visible; }

.sentence-wrapper {
  border-left: 4px solid #ffc423;
  padding-left: 20px;
  margin-bottom: 40px;
}

#context { padding: 2rem 0 1rem; }

hr { margin: 2em 0 1em; }

.technical-details-info { margin-bottom: 100px; }

.loader-wrapper {
  display: flex;
  align-items: center;
  background-color: rgba(250, 202, 43, 0.2);
  padding: 15px 20px;
  border-radius: 6px;
}

.loader-wrapper p { margin-bottom: 0; margin-left: 20px; }

.loader {
  width: 30px;
  height: 30px;
  border: dotted 5px #868686;
  border-radius: 100%;
  animation: spin 1s linear infinite;
}

.loader-note { font-size: 14px; color: #b3b3b3; margin-left: 5px; }

@keyframes spin {
  0% {
    transform: rotate(0deg) scale(0.8);
    border-top-color: transparent;
    border-right-color: transparent;
  }
  50% {
    transform: rotate(180deg) scale(1.2);
    border-color: #949494;
    border-top-color: transparent;
    border-right-color: transparent;
  }
  100% {
    transform: rotate(360deg) scale(0.8);
    border-color: #bbbbbb;
    border-top-color: transparent;
    border-right-color: transparent;
  }
}
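style.css targets Streamlit-generated widget classes (stTextInput, stButton) plus a few custom classes such as .footer-custom and .loader-wrapper. A minimal sketch of how a local stylesheet like this is typically injected into a Streamlit app is shown below; this is a common pattern and an assumption here, since app.py may load style.css differently.

import streamlit as st

# Inject a local stylesheet into the Streamlit page (assumed usage pattern).
def local_css(file_name: str) -> None:
    with open(file_name) as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

local_css("style.css")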
utils/__init__.py
ADDED
@@ -0,0 +1 @@
# adding for package implementation
utils/checkconfig.py
ADDED
@@ -0,0 +1,15 @@
import configparser
import logging

def getconfig(configfile_path:str):
    """
    Read and return the application configuration.

    configfile_path: file path of the .cfg file
    """

    config = configparser.ConfigParser()

    try:
        config.read_file(open(configfile_path))
        return config
    except FileNotFoundError:
        logging.warning("config file not found: %s", configfile_path)
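A short usage sketch for getconfig, reading values from the paramconfig.cfg shown earlier in this commit. The specific keys pulled here are only examples; the file defines them under [lexical_search], [semantic_search], [sdg] and [coherence], and which ones each module actually reads is not asserted here.

from utils.checkconfig import getconfig

config = getconfig("paramconfig.cfg")

retriever_model = config.get("semantic_search", "RETRIEVER")   # 'all-mpnet-base-v2'
top_k = config.getint("lexical_search", "TOP_K")               # 20
sdg_threshold = config.getfloat("sdg", "THRESHOLD")            # 0.85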
utils/keyword_extraction.py
ADDED
@@ -0,0 +1,140 @@
import pandas as pd
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
import pickle
from typing import List, Text
import logging
from summa import keywords

try:
    import streamlit as st
except ImportError:
    logging.info("Streamlit not installed")


def sort_coo(coo_matrix):
    """
    Takes a COOrdinate-format scipy sparse matrix and returns its entries
    sorted by score.
    1. https://kavita-ganesan.com/python-keyword-extraction/#.Y2-TFHbMJPb
    """
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
    """get the feature names and tf-idf score of top n items

    Params
    ---------
    feature_names: list of words from the vectorizer
    sorted_items: tuple returned by the sort_coo function defined in
        keyword_extraction.py
    top_n: top n words to be extracted using tf-idf

    Return
    ----------
    results: top extracted keywords

    """

    # use only the top n items from the vector
    sorted_items = sorted_items[:top_n]
    score_vals = []
    feature_vals = []

    # word index and corresponding tf-idf score
    for idx, score in sorted_items:

        # keep track of the feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]

    return results


def tfidf_keyword(textdata:str, vectorizer, tfidfmodel, top_n):
    """
    TF-IDF based keyword extraction

    Params
    ---------
    vectorizer: trained count vectorizer model
    tfidfmodel: TF-IDF transformer model
    top_n: top n keywords to be extracted
    textdata: text data on which keyword extraction is performed

    Return
    ----------
    keywords: top extracted keywords

    """
    features = vectorizer.get_feature_names_out()
    tf_idf_vector=tfidfmodel.transform(vectorizer.transform(textdata))
    sorted_items=sort_coo(tf_idf_vector.tocoo())
    results=extract_topn_from_vector(features,sorted_items,top_n)
    keywords = [keyword for keyword in results]
    return keywords

def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
    """
    TF-IDF based keyword extraction

    Params
    ---------
    sdg: which SDG tf-idf model to be used
    sdgdata: text data on which keyword extraction is performed

    Return
    ----------
    keywords: top extracted keywords

    """
    model_path = "docStore/sdg{}/".format(sdg)
    vectorizer = pickle.load(open(model_path+'vectorizer.pkl', 'rb'))
    tfidfmodel = pickle.load(open(model_path+'tfidfmodel.pkl', 'rb'))
    features = vectorizer.get_feature_names_out()
    tf_idf_vector=tfidfmodel.transform(vectorizer.transform(sdgdata))
    sorted_items=sort_coo(tf_idf_vector.tocoo())
    top_n = top_n
    results=extract_topn_from_vector(features,sorted_items,top_n)
    keywords = [keyword for keyword in results]
    return keywords

@st.cache(allow_output_mutation=True)
def textrank(textdata:Text, ratio:float = 0.1, words:int = 0)->List[str]:
    """
    Wrapper function to perform textrank; uses either ratio or word count to
    limit the number of extracted keywords.
    1. https://github.com/summanlp/textrank/blob/master/summa/keywords.py

    Params
    --------
    textdata: text data on which textrank is performed.
    ratio: float to limit the number of keywords as a proportion of the total
        tokens in textdata
    words: number of keywords to be extracted. Takes priority over ratio if
        non-zero. However, in case pagerank returns fewer keywords than the
        fixed value, the ratio is used instead.

    Return
    --------
    results: extracted keywords
    """
    if words == 0:
        logging.info("Textrank using default ratio value = 0.1, as no word limit given")
        results = keywords.keywords(textdata, ratio= ratio).split("\n")
    else:
        try:
            results = keywords.keywords(textdata, words= words).split("\n")
        except:
            results = keywords.keywords(textdata, ratio = ratio).split("\n")

    return results
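A small usage sketch for the textrank wrapper above. The sample paragraph is made up for illustration, and because textrank is decorated with st.cache the module expects Streamlit to be installed (as it is in this Space's requirements).

from utils.keyword_extraction import textrank

text = ("The plan aims to build a climate resilient green economy, raise "
        "renewable electricity production and reduce greenhouse gas emissions "
        "while expanding job opportunities in agriculture and manufacturing.")

# Ask for up to 5 keywords; falls back to the ratio setting if fewer are found.
print(textrank(text, words=5))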
utils/lexical_search.py
ADDED
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from haystack.nodes import TfidfRetriever
from haystack.document_stores import InMemoryDocumentStore
import spacy
import re
from spacy.matcher import Matcher
from markdown import markdown
from annotated_text import annotation
from haystack.schema import Document
from typing import List, Text, Tuple
from typing_extensions import Literal
from utils.preprocessing import processingpipeline
from utils.streamlitcheck import check_streamlit
import logging
try:
    from termcolor import colored
except:
    pass

try:
    import streamlit as st
except ImportError:
    logging.info("Streamlit not installed")


def runLexicalPreprocessingPipeline(file_name:str, file_path:str,
                        split_by: Literal["sentence", "word"] = 'word',
                        split_length:int = 80, split_overlap:int = 0,
                        remove_punc:bool = False)->List[Document]:
    """
    Creates and runs the preprocessing pipeline; the params for the pipeline
    are fetched from paramconfig. As lexical search is not affected by overlap,
    split_overlap = 0 and split_by = 'word' in the default paramconfig.

    Params
    ------------
    file_name: filename, in case of streamlit application use
    st.session_state['filename']
    file_path: filepath, in case of streamlit application use
    st.session_state['filepath']
    split_by: document splitting strategy, either word or sentence
    split_length: when synthetically creating the paragraphs from the document,
    it defines the length of a paragraph.
    split_overlap: number of words or sentences that overlap when creating
    the paragraphs. This is done as one sentence or 'some words' make sense
    when read together with others. Therefore the overlap is used.
    remove_punc: whether to remove all punctuation including ',' and '.'

    Return
    --------------
    List[Document]: When the preprocessing pipeline is run, the output dictionary
    has four objects. For the lexical search using TfidfRetriever we
    need to use the List of Haystack Documents, which can be fetched by
    key = 'documents' on the output.
    """

    lexical_processing_pipeline = processingpipeline()

    output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
                            params= {"FileConverter": {"file_path": file_path, \
                                        "file_name": file_name},
                                     "UdfPreProcessor": {"remove_punc": remove_punc, \
                                            "split_by": split_by, \
                                            "split_length":split_length,\
                                            "split_overlap": split_overlap}})

    return output_lexical_pre


def tokenize_lexical_query(query:str)-> List[str]:
    """
    Removes the stop words from the query and returns the list of important keywords
    in the query. For the lexical search the relevant paragraphs in the document are
    retrieved using TfidfRetriever from Haystack. However, to highlight these
    keywords we need the tokenized form of the query.

    Params
    --------
    query: string which represents either a list of keywords the user is looking for
    or a query in the form of a question.

    Return
    -----------
    token_list: list of important keywords in the query.
    """
    nlp = spacy.load("en_core_web_sm")
    token_list = [token.text.lower() for token in nlp(query)
                  if not (token.is_stop or token.is_punct)]
    return token_list

def runSpacyMatcher(token_list:List[str], document:Text
                    )->Tuple[List[List[int]],spacy.tokens.doc.Doc]:
    """
    Using spacy in the backend, finds the keywords in the document using the
    Matcher class from spacy. We can alternatively use regex, but spacy
    finds all keywords in a serialized manner, which helps in annotation of answers.

    Params
    -------
    token_list: the token list which the tokenize_lexical_query function returns
    document: text in which we need to find the tokens

    Return
    --------
    matches: List of [start_index, end_index] in the spacydoc (at word level, not
    character) for the keywords in the token list.

    spacydoc: the keyword indices in the spacydoc are at word level and not character,
    therefore to allow the annotator to work seamlessly we return the spacydoc.
    """
    nlp = spacy.load("en_core_web_sm")
    spacydoc = nlp(document)
    matcher = Matcher(nlp.vocab)
    token_pattern = [[{"LOWER":token}] for token in token_list]
    matcher.add(",".join(token_list), token_pattern)
    spacymatches = matcher(spacydoc)

    # getting start and end index in spacydoc so that annotator can work seamlessly
    matches = []
    for match_id, start, end in spacymatches:
        matches = matches + [[start, end]]

    return matches, spacydoc

def runRegexMatcher(token_list:List[str], document:Text):
    """
    Using regex in the backend, finds the keywords in the document.

    Params
    -------
    token_list: the token list which the tokenize_lexical_query function returns

    document: text in which we need to find the tokens

    Return
    --------
    matches: List of [start_index, end_index] in the document for the keywords
    in the token list, at character level.

    document: the keyword indices returned by regex are at character level,
    therefore to allow the annotator to work seamlessly we return the text back.
    """
    matches = []
    for token in token_list:
        matches = (matches +
                   [[val.start(), val.start() +
                     len(token)] for val in re.finditer(token, document)])

    return matches, document

def spacyAnnotator(matches: List[List[int]], document:spacy.tokens.doc.Doc):
    """
    This is the spacy annotator and needs a spacy doc.
    Annotates the text in the document defined by a list of [start index, end index].
    Example: "How are you today"; if the document type is text, matches = [[0,3]]
    will give answer = "How", however in case we used the spacy matcher then
    matches = [[0,3]] will give answer = "How are you". If spacy is used
    to find "How" then the matches = [[0,1]] for the string defined above.

    Params
    -----------
    matches: As mentioned, a list of lists. Example [[0,1],[10,13]]
    document: document which needs to be indexed.

    Return
    --------
    Will send the output to the app front end using streamlit or
    write directly to the output screen.
    """
    start = 0
    annotated_text = ""
    for match in matches:
        start_idx = match[0]
        end_idx = match[1]

        if check_streamlit():
            annotated_text = (annotated_text + document[start:start_idx].text
                              + str(annotation(body=document[start_idx:end_idx].text,
                              label="ANSWER", background="#964448", color='#ffffff')))
        else:
            annotated_text = (annotated_text + document[start:start_idx].text
                              + colored(document[start_idx:end_idx].text,
                              "green", attrs = ['bold']))

        start = end_idx

    annotated_text = annotated_text + document[end_idx:].text

    if check_streamlit():
        st.write(
            markdown(annotated_text),
            unsafe_allow_html=True,
        )
    else:
        print(annotated_text)

def lexical_search(query:Text, documents:List[Document], top_k:int):
    """
    Performs the lexical search on the list of Haystack documents which is
    returned by the preprocessing pipeline.

    Params
    -------
    query: Keywords that need to be searched in the documents.
    documents: List of Haystack documents returned by the preprocessing pipeline.
    top_k: Number of top results to be fetched.
    """

    document_store = InMemoryDocumentStore()
    document_store.write_documents(documents)

    # Haystack Retriever works with document stores only.
    retriever = TfidfRetriever(document_store)
    results = retriever.retrieve(query=query, top_k = top_k)
    query_tokens = tokenize_lexical_query(query)
    flag = True
    for count, result in enumerate(results):
        matches, doc = runSpacyMatcher(query_tokens, result.content)

        if len(matches) != 0:
            if flag:
                flag = False
                if check_streamlit():
                    st.markdown("##### Top few lexical search (TFIDF) hits #####")
                else:
                    print("Top few lexical search (TFIDF) hits")

            if check_streamlit():
                st.write("Result {}".format(count+1))
            else:
                print("Results {}".format(count+1))
            spacyAnnotator(matches, doc)

    if flag:
        if check_streamlit():
            st.info("🤔 No relevant result found. Please try another keyword.")
        else:
            print("No relevant result found. Please try another keyword.")
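A minimal usage sketch tying the two entry points of this module together (the file name and path are hypothetical; everything else uses the functions defined in this file):

# Hypothetical example (not part of the commit): run the lexical pipeline
# on an uploaded file and search it for a keyword.
from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search

output = runLexicalPreprocessingPipeline(file_name="policy.pdf",       # assumed name
                                         file_path="/tmp/policy.pdf")  # assumed path
paragraphs = output['documents']   # List[Document] from the pipeline output
lexical_search(query="water", documents=paragraphs, top_k=5)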
utils/ndc_explorer.py
ADDED
@@ -0,0 +1,90 @@

import urllib.request
import json

link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
def get_document(country_code: str):
    """
    Reads the country NDC data from
    https://klimalog.die-gdi.de/ndc/open-data/dataset.json
    using the country code.

    Params
    -------
    country_code: standard country code used to fetch the country-specific data.
    """
    with urllib.request.urlopen(link) as urlfile:
        data = json.loads(urlfile.read())
    categoriesData = {}
    categoriesData['categories'] = data['categories']
    categoriesData['subcategories'] = data['subcategories']
    keys_sub = categoriesData['subcategories'].keys()
    documentType = 'NDCs'
    if documentType in data.keys():
        if country_code in data[documentType].keys():
            get_dict = {}
            for key, value in data[documentType][country_code].items():
                if key not in ['country_name','region_id', 'region_name']:
                    get_dict[key] = value['classification']
                else:
                    get_dict[key] = value
        else:
            return None
    else:
        return None

    country = {}
    for key in categoriesData['categories']:
        country[key] = {}
    for key, value in categoriesData['subcategories'].items():
        country[value['category']][key] = get_dict[key]

    return country


def countrySpecificCCA(cca_sent:dict, threshold:int, countryCode:str):
    """
    Based on the country code, reads the country data from
    https://klimalog.die-gdi.de/ndc/open-data/dataset.json
    using get_document from utils.ndc_explorer.py,
    then based on the threshold value filters the Climate Change Adaptation
    targets assigned by the NDC explorer team to that country. Using the sentences
    created by the Data services team of GIZ for each target level, tries to find the
    relevant passages from the document by doing the semantic search.

    Params
    -------
    cca_sent: dictionary with keys as 'target labels' and manufactured sentences
    reflecting the target level. Please see docStore/ndcs/cca.txt

    threshold: NDC targets have many categories ranging from [0-5], with 0
    reflecting the most relaxed attitude and 5 being the most aggressive towards
    climate change. We select the threshold value beyond which we need to focus.

    countryCode: standard country code to allow us to fetch the country-specific
    data.
    """
    temp = {}
    doc = get_document(countryCode)
    for key, value in cca_sent.items():
        id_ = doc['climate change adaptation'][key]['id']
        if id_ > threshold:
            temp[key] = value['id'][id_]
    return temp


def countrySpecificCCM(ccm_sent, threshold, countryCode):
    """
    See the documentation of countrySpecificCCA. This is the same, except that
    it gets the data pertaining to Climate Change Mitigation.
    """

    temp = {}
    doc = get_document(countryCode)
    for key, value in ccm_sent.items():
        id_ = doc['climate change mitigation'][key]['id']
        if id_ > threshold:
            temp[key] = value['id'][id_]

    return temp
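A short sketch of how get_document is typically inspected before passing the result to the CCA/CCM filters above; the country code is an assumption and the dataset structure is only as implied by the code in this file:

# Hypothetical example (not part of the commit): fetch the NDC explorer data
# for one country and look at the available categories/subcategories.
from utils.ndc_explorer import get_document

country = get_document("IND")   # assumed country code; returns None if not in the dataset
if country is not None:
    print(list(country.keys()))                               # NDC explorer categories
    print(list(country['climate change adaptation'].keys()))  # CCA subcategory labels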
utils/preprocessing.py
ADDED
@@ -0,0 +1,260 @@
from haystack.nodes.base import BaseComponent
from haystack.schema import Document
from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
from typing import Callable, Dict, List, Optional, Text, Tuple, Union
from typing_extensions import Literal
import pandas as pd
import logging
import re
import string
from haystack.pipelines import Pipeline

def useOCR(file_path: str)-> Text:
    """
    Converts image pdfs into text, using farm-haystack[ocr].

    Params
    ----------
    file_path: file_path of the uploaded file, returned by the add_upload function in
    uploadAndExample.py

    Returns the text file as a string.
    """

    converter = PDFToTextOCRConverter(remove_numeric_tables=True,
                                      valid_languages=["eng"])
    docs = converter.convert(file_path=file_path, meta=None)
    return docs[0].content


class FileConverter(BaseComponent):
    """
    Wrapper class to convert an uploaded document into text by calling the appropriate
    converter class; will internally use haystack PDFToTextOCR in case of an image
    pdf. Cannot use the FileClassifier from haystack as it doesn't have any
    label/output class for images.

    1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
    2. https://docs.haystack.deepset.ai/docs/file_converters
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
    4. https://docs.haystack.deepset.ai/reference/file-converters-api
    """

    outgoing_edges = 1

    def run(self, file_name: str, file_path: str, encoding: Optional[str]=None,
            id_hash_keys: Optional[List[str]] = None,
            ) -> Tuple[dict,str]:
        """ This is the required method to invoke the component in
        the pipeline implementation.

        Params
        ----------
        file_name: name of file
        file_path: file_path of the uploaded file, returned by the add_upload function in
        uploadAndExample.py

        See the links provided in the class docstring/description for the other params.

        Return
        ---------
        output: dictionary, with key as identifier and value could be anything
        we need to return. In this case it is the List of Haystack Documents.

        output_1: As there is only one outgoing edge, we pass the 'output_1' string.
        """
        try:
            if file_name.endswith('.pdf'):
                converter = PDFToTextConverter(remove_numeric_tables=True)
            if file_name.endswith('.txt'):
                converter = TextConverter(remove_numeric_tables=True)
            if file_name.endswith('.docx'):
                converter = DocxToTextConverter()
        except Exception as e:
            logging.error(e)
            return

        documents = []

        document = converter.convert(
                      file_path=file_path, meta=None,
                      encoding=encoding, id_hash_keys=id_hash_keys
                      )[0]

        text = document.content

        # if the file is an image pdf then it will have {'content': "\x0c\x0c\x0c\x0c"}
        # substitute this substring with '', and check if the content is an empty string

        text = re.sub(r'\x0c', '', text)
        documents.append(Document(content=text,
                                  meta={"name": file_name},
                                  id_hash_keys=id_hash_keys))

        # check if the text is empty and apply the pdf OCR converter.
        for i in documents:
            if i.content == "":
                logging.info("Using OCR")
                i.content = useOCR(file_path)

        logging.info('file conversion successful')
        output = {'documents': documents}
        return output, 'output_1'

    def run_batch():
        """
        We don't have a requirement to process multiple files in one go,
        therefore nothing here; however, to use the custom node we need to have
        this method for the class.
        """

        return


def basic(s:str, remove_punc:bool = False):

    """
    Performs basic cleaning of text.

    Params
    ----------
    s: string to be processed
    remove_punc: whether to remove all punctuation including ',' and '.'

    Returns: processed string; see comments in the source code for more info
    """

    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)

    # Remove new line characters
    s = re.sub('\n', ' ', s)

    # Remove punctuations
    if remove_punc == True:
        translator = str.maketrans(' ', ' ', string.punctuation)
        s = s.translate(translator)
    # Remove distracting single quotes and dotted pattern
    s = re.sub("\'", " ", s)
    s = s.replace("..", "")

    return s.strip()


class UdfPreProcessor(BaseComponent):
    """
    Class to preprocess the document returned by FileConverter. It will check
    for the splitting strategy, split the document by word or sentence and then
    synthetically create the paragraphs.

    1. https://docs.haystack.deepset.ai/docs/preprocessor
    2. https://docs.haystack.deepset.ai/reference/preprocessor-api
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
    """
    outgoing_edges = 1

    def run(self, documents:List[Document], remove_punc:bool=False,
            split_by: Literal["sentence", "word"] = 'sentence',
            split_length:int = 2, split_respect_sentence_boundary:bool = False,
            split_overlap:int = 0):

        """ This is the required method to invoke the component in
        the pipeline implementation.

        Params
        ----------
        documents: documents from the output dictionary returned by FileConverter
        remove_punc: whether to remove all punctuation including ',' and '.'
        split_by: document splitting strategy, either word or sentence
        split_length: when synthetically creating the paragraphs from the document,
        it defines the length of a paragraph.
        split_respect_sentence_boundary: used when using the 'word' strategy for
        splitting of text.
        split_overlap: number of words or sentences that overlap when creating
        the paragraphs. This is done as one sentence or 'some words' make sense
        when read together with others. Therefore the overlap is used.

        Return
        ---------
        output: dictionary, with key as identifier and value could be anything
        we need to return. In this case the output will contain 4 objects:
        the paragraph text list as a List, Haystack documents, a Dataframe and
        one raw text file.

        output_1: As there is only one outgoing edge, we pass the 'output_1' string.
        """

        if split_by == 'sentence':
            split_respect_sentence_boundary = False

        else:
            split_respect_sentence_boundary = split_respect_sentence_boundary

        preprocessor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by=split_by,
            split_length=split_length,
            split_respect_sentence_boundary= split_respect_sentence_boundary,
            split_overlap=split_overlap,

            # will add page number only in case of PDF, not for text/docx files.
            add_page_number=True
        )

        for i in documents:
            # # basic cleaning before passing it to preprocessor.
            # i = basic(i)
            docs_processed = preprocessor.process([i])
            for item in docs_processed:
                item.content = basic(item.content, remove_punc= remove_punc)

        df = pd.DataFrame(docs_processed)
        all_text = " ".join(df.content.to_list())
        para_list = df.content.to_list()
        logging.info('document split into {} paragraphs'.format(len(para_list)))
        output = {'documents': docs_processed,
                  'dataframe': df,
                  'text': all_text,
                  'paraList': para_list
                 }
        return output, "output_1"

    def run_batch():
        """
        We don't have a requirement to process multiple files in one go,
        therefore nothing here; however, to use the custom node we need to have
        this method for the class.
        """
        return


def processingpipeline():
    """
    Returns the preprocessing pipeline. Will use FileConverter and UdfPreProcessor
    from utils.preprocessing.
    """

    preprocessing_pipeline = Pipeline()
    file_converter = FileConverter()
    custom_preprocessor = UdfPreProcessor()

    preprocessing_pipeline.add_node(component=file_converter,
                                    name="FileConverter", inputs=["File"])
    preprocessing_pipeline.add_node(component = custom_preprocessor,
                                    name ='UdfPreProcessor', inputs=["FileConverter"])

    return preprocessing_pipeline
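A minimal sketch of how this pipeline is invoked (hypothetical file name and path; the call mirrors the run calls used by the search modules in this commit):

# Hypothetical example (not part of the commit): convert a file and split it
# into two-sentence paragraphs using the custom nodes defined above.
from utils.preprocessing import processingpipeline

pipeline = processingpipeline()
result = pipeline.run(file_paths="/tmp/policy.pdf",                   # assumed path
                      params={"FileConverter": {"file_path": "/tmp/policy.pdf",
                                                "file_name": "policy.pdf"},
                              "UdfPreProcessor": {"remove_punc": False,
                                                  "split_by": "sentence",
                                                  "split_length": 2,
                                                  "split_overlap": 0}})
paragraphs = result['documents']   # other keys: 'dataframe', 'text', 'paraList'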
utils/sdg_classifier.py
ADDED
@@ -0,0 +1,177 @@
from haystack.nodes import TransformersDocumentClassifier
from haystack.schema import Document
from typing import List, Tuple
from typing_extensions import Literal
import logging
import pandas as pd
from pandas import DataFrame, Series
from utils.checkconfig import getconfig
from utils.streamlitcheck import check_streamlit
from utils.preprocessing import processingpipeline
try:
    import streamlit as st
except ImportError:
    logging.info("Streamlit not installed")

## Labels dictionary ###
_lab_dict = {0: 'no_cat',
             1: 'SDG 1 - No poverty',
             2: 'SDG 2 - Zero hunger',
             3: 'SDG 3 - Good health and well-being',
             4: 'SDG 4 - Quality education',
             5: 'SDG 5 - Gender equality',
             6: 'SDG 6 - Clean water and sanitation',
             7: 'SDG 7 - Affordable and clean energy',
             8: 'SDG 8 - Decent work and economic growth',
             9: 'SDG 9 - Industry, Innovation and Infrastructure',
             10: 'SDG 10 - Reduced inequality',
             11: 'SDG 11 - Sustainable cities and communities',
             12: 'SDG 12 - Responsible consumption and production',
             13: 'SDG 13 - Climate action',
             14: 'SDG 14 - Life below water',
             15: 'SDG 15 - Life on land',
             16: 'SDG 16 - Peace, justice and strong institutions',
             17: 'SDG 17 - Partnership for the goals',}

@st.cache(allow_output_mutation=True)
def load_sdgClassifier(config_file:str = None, classifier_name:str = None):
    """
    Loads the document classifier using haystack, where the name/path of the model
    on the HF hub as a string is used to fetch the model object. Either the config file
    or the model name should be passed.
    1. https://docs.haystack.deepset.ai/reference/document-classifier-api
    2. https://docs.haystack.deepset.ai/docs/document_classifier

    Params
    --------
    config_file: config file path from which to read the model name
    classifier_name: if a model name is passed, it takes priority; if not
    found then the config file is used, else an error is raised.

    Return: document classifier model
    """
    if not classifier_name:
        if not config_file:
            logging.warning("Pass either model name or config file")
            return
        else:
            config = getconfig(config_file)
            classifier_name = config.get('sdg','MODEL')

    logging.info("Loading classifier")
    doc_classifier = TransformersDocumentClassifier(
                        model_name_or_path=classifier_name,
                        task="text-classification")

    return doc_classifier


@st.cache(allow_output_mutation=True)
def sdg_classification(haystack_doc:List[Document],
                       threshold:float = 0.8,
                       classifier_model:TransformersDocumentClassifier = None
                       )->Tuple[DataFrame,Series]:
    """
    Text classification on the list of texts provided. The classifier provides the
    most appropriate label for each text; these labels indicate which particular
    Sustainable Development Goal (SDG) the text belongs to.

    Params
    ---------
    haystack_doc: List of haystack Documents. The output of the preprocessing pipeline
    contains the list of paragraphs in different formats; here the list of
    Haystack Documents is used.
    threshold: threshold value for the model to keep the results from the classifier
    classifier_model: you can pass the classifier model directly, which takes priority;
    if not, it looks for the model in the streamlit session.
    In case of streamlit, avoid passing the model directly.

    Returns
    ----------
    df: Dataframe with columns ['SDG', 'Relevancy', 'text']
    x: Series object with the unique SDGs covered in the uploaded document and
    the number of times each is covered/discussed (count of paragraphs).
    """
    logging.info("Working on SDG Classification")
    if not classifier_model:
        if check_streamlit():
            classifier_model = st.session_state['sdg_classifier']
        else:
            logging.warning("No streamlit environment found, pass the classifier")
            return

    results = classifier_model.predict(haystack_doc)

    labels_ = [(l.meta['classification']['label'],
                l.meta['classification']['score'], l.content,) for l in results]

    df = DataFrame(labels_, columns=["SDG","Relevancy","text"])

    df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
    df.index += 1
    df = df[df['Relevancy'] > threshold]

    # creating the dataframe for value counts of SDG, along with 'title' of SDGs
    x = df['SDG'].value_counts()
    x = x.rename('count')
    x = x.rename_axis('SDG').reset_index()
    x["SDG"] = pd.to_numeric(x["SDG"])
    x = x.sort_values(by=['count'], ascending=False)
    x['SDG_name'] = x['SDG'].apply(lambda x: _lab_dict[x])
    x['SDG_Num'] = x['SDG'].apply(lambda x: "SDG "+str(x))

    df['SDG'] = pd.to_numeric(df['SDG'])
    df = df.sort_values('SDG')

    return df, x

def runSDGPreprocessingPipeline(file_name:str, file_path:str,
                    split_by: Literal["sentence", "word"] = 'sentence',
                    split_length:int = 2, split_respect_sentence_boundary:bool = False,
                    split_overlap:int = 0, remove_punc:bool = False)->List[Document]:
    """
    Creates and runs the preprocessing pipeline;
    the params for the pipeline are fetched from paramconfig.

    Params
    ------------
    file_name: filename, in case of streamlit application use
    st.session_state['filename']
    file_path: filepath, in case of streamlit application use st.session_state['filepath']
    split_by: document splitting strategy, either word or sentence
    split_length: when synthetically creating the paragraphs from the document,
    it defines the length of a paragraph.
    split_respect_sentence_boundary: used when using the 'word' strategy for
    splitting of text.
    split_overlap: number of words or sentences that overlap when creating
    the paragraphs. This is done as one sentence or 'some words' make sense
    when read together with others. Therefore the overlap is used.
    remove_punc: whether to remove all punctuation including ',' and '.'

    Return
    --------------
    List[Document]: When the preprocessing pipeline is run, the output dictionary
    has four objects. For the Haystack implementation of SDG classification we
    need to use the List of Haystack Documents, which can be fetched by
    key = 'documents' on the output.
    """

    sdg_processing_pipeline = processingpipeline()

    output_sdg_pre = sdg_processing_pipeline.run(file_paths = file_path,
                            params= {"FileConverter": {"file_path": file_path, \
                                        "file_name": file_name},
                                     "UdfPreProcessor": {"remove_punc": remove_punc, \
                                            "split_by": split_by, \
                                            "split_length":split_length,\
                                            "split_overlap": split_overlap, \
                "split_respect_sentence_boundary":split_respect_sentence_boundary}})

    return output_sdg_pre
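A minimal sketch of running the SDG classification outside the streamlit front end, using only the functions defined in this file; the file name/path are assumptions, paramconfig.cfg is the config file shipped in this commit, and streamlit must be installed for the st.cache decorators to resolve:

# Hypothetical example (not part of the commit): preprocess a document and
# classify its paragraphs by SDG.
from utils.sdg_classifier import (load_sdgClassifier, sdg_classification,
                                  runSDGPreprocessingPipeline)

classifier = load_sdgClassifier(config_file="paramconfig.cfg")    # reads sdg/MODEL
pre = runSDGPreprocessingPipeline(file_name="policy.pdf",         # assumed name
                                  file_path="/tmp/policy.pdf")    # assumed path
df, counts = sdg_classification(pre['documents'], threshold=0.8,
                                classifier_model=classifier)
print(counts)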
utils/semantic_search.py
ADDED
@@ -0,0 +1,582 @@
from haystack.nodes import TransformersQueryClassifier, Docs2Answers
from haystack.nodes import EmbeddingRetriever, FARMReader
from haystack.nodes.base import BaseComponent
from haystack.document_stores import InMemoryDocumentStore
from markdown import markdown
from annotated_text import annotation
from haystack.schema import Document
from typing import List, Text, Union
from typing_extensions import Literal
from utils.preprocessing import processingpipeline
from utils.streamlitcheck import check_streamlit
from haystack.pipelines import Pipeline
import pandas as pd
import logging
try:
    from termcolor import colored
except:
    pass
try:
    import streamlit as st
except ImportError:
    logging.info("Streamlit not installed")


@st.cache(allow_output_mutation=True)
def loadQueryClassifier():
    """
    Returns the haystack query classifier model
    model = shahrukhx01/bert-mini-finetune-question-detection
    """
    query_classifier = TransformersQueryClassifier(model_name_or_path=
                        "shahrukhx01/bert-mini-finetune-question-detection")
    return query_classifier

class QueryCheck(BaseComponent):
    """
    Uses the Query Classifier from Haystack to process the query based on query type.
    The ability to determine statements is not so good, therefore the chances are that
    statements also get modified. Ex: "List water related issues" will be
    identified by the model as keywords, and therefore it will be processed as "what
    are the 'list all water related issues' related issues and discussions?".
    This is one shortcoming but it is ignored for now, as semantic search will not
    be affected much by this. If you want to pass a keywords list and want to
    do batch processing, use run_batch. Example: if you want to find relevant
    passages for water, food security, poverty then querylist = ["water", "food
    security","poverty"] and then execute QueryCheck.run_batch(queries = querylist)

    1. https://docs.haystack.deepset.ai/docs/query_classifier
    """

    outgoing_edges = 1

    def run(self, query:str):
        """
        Mandatory method to use the custom node. Determines the query type; if
        the query is of type keyword/statement it will modify it to make it more
        useful for sentence transformers.

        Params
        --------
        query: query/statement/keywords in form of string

        Return
        ------
        output: dictionary, with key as identifier and value could be anything
        we need to return. In this case the output contains key = 'query'.

        output_1: As there is only one outgoing edge, we pass the 'output_1' string.
        """
        query_classifier = loadQueryClassifier()
        result = query_classifier.run(query=query)

        if result[1] == "output_1":
            output = {"query":query,
                      "query_type": 'question/statement'}
        else:
            output = {"query": "what are the {} related issues and \
                      discussions?".format(query),
                      "query_type": 'statements/keyword'}
        logging.info(output)
        return output, "output_1"

    def run_batch(self, queries:List[str]):
        """
        Runs multiple queries in one go; however, the queries need to be passed
        as a list of strings. Example: if you want to find relevant passages for
        water, food security, poverty then querylist = ["water", "food security",
        "poverty"] and then execute QueryCheck.run_batch(queries = querylist)

        Params
        --------
        queries: queries/statements/keywords in form of string encapsulated
        within List

        Return
        ------
        output: dictionary, with key as identifier and value could be anything
        we need to return. In this case the output contains key = 'queries'.

        output_1: As there is only one outgoing edge, we pass the 'output_1' string.
        """
        query_classifier = loadQueryClassifier()
        query_list = []
        for query in queries:
            result = query_classifier.run(query=query)
            if result[1] == "output_1":
                query_list.append(query)
            else:
                query_list.append("what are the {} related issues and \
                                  discussions?".format(query))
        output = {'queries':query_list}
        logging.info(output)
        return output, "output_1"


@st.cache(allow_output_mutation=True)
def runSemanticPreprocessingPipeline(file_path:str, file_name:str,
                split_by: Literal["sentence", "word"] = 'sentence',
                split_length:int = 2, split_overlap:int = 0,
                split_respect_sentence_boundary:bool = False,
                remove_punc:bool = False)->List[Document]:
    """
    Creates and runs the preprocessing pipeline.

    Params
    ------------
    file_name: filename, in case of streamlit application use
    st.session_state['filename']
    file_path: filepath, in case of streamlit application use
    st.session_state['filepath']
    split_by: document splitting strategy, either word or sentence
    split_length: when synthetically creating the paragraphs from the document,
    it defines the length of a paragraph.
    split_overlap: number of words or sentences that overlap when creating the
    paragraphs. This is done as one sentence or 'some words' make sense
    when read together with others. Therefore the overlap is used.
    split_respect_sentence_boundary: used when using the 'word' strategy for
    splitting of text.
    remove_punc: whether to remove all punctuation including ',' and '.'

    Return
    --------------
    List[Document]: When the preprocessing pipeline is run, the output dictionary
    has four objects. For the Haystack implementation of semantic search we
    need to use the List of Haystack Documents, which can be fetched by
    key = 'documents' on the output.
    """

    semantic_processing_pipeline = processingpipeline()

    output_semantic_pre = semantic_processing_pipeline.run(file_paths = file_path,
                            params= {"FileConverter": {"file_path": file_path, \
                                        "file_name": file_name},
                                     "UdfPreProcessor": {"remove_punc": remove_punc, \
                                            "split_by": split_by, \
                                            "split_length":split_length,\
                                            "split_overlap": split_overlap,
                "split_respect_sentence_boundary":split_respect_sentence_boundary}})

    return output_semantic_pre


@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
          allow_output_mutation=True)
def loadRetriever(embedding_model:Text=None, embedding_model_format:Text = None,
                  embedding_layer:int = None, retriever_top_k:int = 10,
                  max_seq_len:int=512, document_store:InMemoryDocumentStore=None):
    """
    Returns the Retriever model based on the params provided.
    1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
    2. https://www.sbert.net/examples/applications/semantic-search/README.html
    3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py

    Params
    ---------
    embedding_model: Name of the model to be used for embedding. Check the links
    provided in documentation
    embedding_model_format: check the github link of Haystack provided in
    documentation
    embedding_layer: check the github link of Haystack provided in documentation
    retriever_top_k: Number of top results to be returned by the retriever
    max_seq_len: every model has a max seq len it can handle, check in the
    model card. Needed to handle the edge cases.
    document_store: InMemoryDocumentStore, write haystack Document list to
    DocumentStore and pass the same to the function call. Can be done using
    createDocumentStore from utils.

    Return
    -------
    retriever: embedding model
    """
    logging.info("loading retriever")
    if document_store is None:
        logging.warning("Retriever initialization requires the DocumentStore")
        return

    retriever = EmbeddingRetriever(
                embedding_model=embedding_model, top_k = retriever_top_k,
                document_store = document_store,
                emb_extraction_layer=embedding_layer, scale_score =True,
                model_format=embedding_model_format, use_gpu = True,
                max_seq_len = max_seq_len )
    if check_streamlit:
        st.session_state['retriever'] = retriever
    return retriever

@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
          allow_output_mutation=True)
def createDocumentStore(documents:List[Document], similarity:str = 'dot_product',
                        embedding_dim:int = 768):
    """
    Creates the InMemory Document Store from a haystack list of Documents.
    It is a mandatory component for the Retriever to work in the Haystack framework.

    Params
    -------
    documents: List of haystack documents. If using the preprocessing pipeline,
    can be fetched with key = 'documents' on the output of the preprocessing pipeline.
    similarity: scoring function, can be either 'cosine' or 'dot_product'
    embedding_dim: Document store has a default value of embedding size = 768, and
    the update_embeddings method of the Docstore cannot infer the embedding size of
    the retriever automatically, therefore set this value as per the model card.

    Return
    -------
    document_store: InMemory Document Store object type.
    """
    document_store = InMemoryDocumentStore(similarity = similarity,
                                           embedding_dim = embedding_dim )
    document_store.write_documents(documents)

    return document_store


@st.cache(hash_funcs={"builtins.SwigPyObject": lambda _: None},
          allow_output_mutation=True)
def semanticSearchPipeline(documents:List[Document], embedding_model:Text = None,
                embedding_model_format:Text = None, embedding_layer:int = None,
                embedding_dim:int = 768, retriever_top_k:int = 10,
                reader_model:str = None, reader_top_k:int = 10,
                max_seq_len:int = 512, useQueryCheck = True,
                top_k_per_candidate:int = 1):
    """
    Creates the semantic search pipeline and document store object from the
    list of haystack documents. The top_k for the Reader and Retriever are kept the
    same, so that all the results returned by the Retriever are used, however the
    context is extracted by the Reader for each retrieved result. The QueryCheck is
    added as a node to process the query. This pipeline is suited for keyword search,
    and to some extent for extractive QA purposes. The purpose of the Reader is strictly
    to highlight the context for the retrieved result and not for QA, however as stated
    it can work for QA too in a limited sense.
    There are 4 variants of pipeline it can return:
    1. QueryCheck > Retriever > Reader
    2. Retriever > Reader
    3. QueryCheck > Retriever > Docs2Answers : If the reader is None,
    then Docs2Answers is used to keep the output of the pipeline structurally the same.
    4. Retriever > Docs2Answers

    Links

    1. https://docs.haystack.deepset.ai/docs/retriever#embedding-retrieval-recommended
    2. https://www.sbert.net/examples/applications/semantic-search/README.html
    3. https://github.com/deepset-ai/haystack/blob/main/haystack/nodes/retriever/dense.py
    4. https://docs.haystack.deepset.ai/docs/reader

    Params
    ----------
    documents: list of Haystack Documents, returned by the preprocessing pipeline.
    embedding_model: Name of the model to be used for embedding. Check the links
    provided in documentation
    embedding_model_format: check the github link of Haystack provided in
    documentation
    embedding_layer: check the github link of Haystack provided in documentation
    embedding_dim: Document store has a default value of embedding size = 768, and
    the update_embeddings method of the Docstore cannot infer the embedding size of
    the retriever automatically, therefore set this value as per the model card.
    retriever_top_k: Number of top results to be returned by the retriever
    reader_model: Name of the model to be used for the Reader node in the haystack
    Pipeline. Check the links provided in documentation
    reader_top_k: The Reader will use retrieved results to further find better matches.
    As the purpose here is to use the reader to extract context, the value is the
    same as retriever_top_k.
    max_seq_len: every model has a max seq len it can handle, check in the model card.
    Needed to handle the edge cases.
    useQueryCheck: Whether to use the QueryCheck which modifies the query or not.
    top_k_per_candidate: How many answers to extract for each candidate doc
    that is coming from the retriever.

    Return
    ---------
    semanticsearch_pipeline: Haystack Pipeline object, with all the necessary
    nodes [QueryCheck, Retriever, Reader/Docs2Answers]. If the reader is None,
    then Docs2Answers is used to keep the output of the pipeline structurally
    the same.

    document_store: As the retriever can work only with a Haystack Document Store, the
    list of documents returned by the preprocessing pipeline is fed in to
    get an InMemoryDocumentStore object type, with the retriever updating the
    embeddings of each paragraph in the document store.
    """
    document_store = createDocumentStore(documents=documents,
                                         embedding_dim=embedding_dim)
    retriever = loadRetriever(embedding_model = embedding_model,
                    embedding_model_format=embedding_model_format,
                    embedding_layer=embedding_layer,
                    retriever_top_k= retriever_top_k,
                    document_store = document_store,
                    max_seq_len=max_seq_len)
    document_store.update_embeddings(retriever)
    semantic_search_pipeline = Pipeline()
    if useQueryCheck and reader_model:
        querycheck = QueryCheck()
        reader = FARMReader(model_name_or_path=reader_model,
                            top_k = reader_top_k, use_gpu=True,
                            top_k_per_candidate = top_k_per_candidate)
        semantic_search_pipeline.add_node(component = querycheck,
                            name = "QueryCheck", inputs = ["Query"])
        semantic_search_pipeline.add_node(component = retriever,
                            name = "EmbeddingRetriever", inputs = ["QueryCheck.output_1"])
        semantic_search_pipeline.add_node(component = reader, name = "FARMReader",
                            inputs= ["EmbeddingRetriever"])

    elif reader_model:
        reader = FARMReader(model_name_or_path=reader_model,
                            top_k = reader_top_k, use_gpu=True,
                            top_k_per_candidate = top_k_per_candidate)
        semantic_search_pipeline.add_node(component = retriever,
                            name = "EmbeddingRetriever", inputs = ["Query"])
        semantic_search_pipeline.add_node(component = reader,
                            name = "FARMReader", inputs= ["EmbeddingRetriever"])
    elif useQueryCheck and not reader_model:
        querycheck = QueryCheck()
        docs2answers = Docs2Answers()
        semantic_search_pipeline.add_node(component = querycheck,
                            name = "QueryCheck", inputs = ["Query"])
        semantic_search_pipeline.add_node(component = retriever,
                            name = "EmbeddingRetriever", inputs = ["QueryCheck.output_1"])
        semantic_search_pipeline.add_node(component = docs2answers,
                            name = "Docs2Answers", inputs= ["EmbeddingRetriever"])
    elif not useQueryCheck and not reader_model:
        docs2answers = Docs2Answers()
        semantic_search_pipeline.add_node(component = retriever,
                            name = "EmbeddingRetriever", inputs = ["Query"])
        semantic_search_pipeline.add_node(component = docs2answers,
                            name = "Docs2Answers", inputs= ["EmbeddingRetriever"])

    logging.info(semantic_search_pipeline.components)
    return semantic_search_pipeline, document_store

def runSemanticPipeline(pipeline:Pipeline, queries:Union[list,str])->dict:
    """
    Will use the haystack run or run_batch based on whether a single query is passed
    as a string or multiple queries as List[str].

    Params
    -------
    pipeline: haystack pipeline, this is the same as returned by semanticSearchPipeline
    from utils.semantic_search

    queries: Either a single query or a list of queries.

    Return
    -------
    results: Dict containing answers and documents as keys and their respective
    values
    """

    if type(queries) == list:
        results = pipeline.run_batch(queries=queries)
    elif type(queries) == str:
        results = pipeline.run(query=queries)
    else:
        logging.info("Please check the input type for the queries")
        return

    return results

def process_query_output(results:dict)->pd.DataFrame:
    """
    Returns the dataframe with the necessary information, including
    ['query','answer','answer_offset','context_offset','context','content',
    'reader_score','retriever_score','id']. This is designed for the output given
    by the semantic search pipeline with a single query and the final node as reader.
    The output of a pipeline having Docs2Answers as the final node or multiple queries
    needs to be handled separately. In these other cases, use process_semantic_output
    from utils.semantic_search, which uses this function internally to make one
    combined dataframe.

    Params
    ---------
    results: this dictionary should have key, values with
    keys = [query, answers, documents], however answers is optional.
    In case of [Docs2Answers as final node], process_semantic_output
    doesn't return answers, thereby setting all values contained in
    answers to 'None'

    Return
    --------
    df: dataframe with all the columns mentioned in the function description.
    """
    query_text = results['query']
    if 'answers' in results.keys():
        answer_dict = {}

        for answer in results['answers']:
            answer_dict[answer.document_id] = answer.to_dict()
    else:
        answer_dict = {}
    docs = results['documents']
    df = pd.DataFrame(columns=['query','answer','answer_offset','context_offset',
                               'context','content','reader_score','retriever_score',
                               'id'])
    for doc in docs:
        row_list = {}
        row_list['query'] = query_text
        row_list['retriever_score'] = doc.score
        row_list['id'] = doc.id
        row_list['content'] = doc.content
        if doc.id in answer_dict.keys():
            row_list['answer'] = answer_dict[doc.id]['answer']
            row_list['context'] = answer_dict[doc.id]['context']
            row_list['reader_score'] = answer_dict[doc.id]['score']
            answer_offset = answer_dict[doc.id]['offsets_in_document'][0]
            row_list['answer_offset'] = [answer_offset['start'],answer_offset['end']]
            start_idx = doc.content.find(row_list['context'])
            end_idx = start_idx + len(row_list['context'])
            row_list['context_offset'] = [start_idx, end_idx]
        else:
            row_list['answer'] = None
            row_list['context'] = None
            row_list['reader_score'] = None
            row_list['answer_offset'] = None
            row_list['context_offset'] = None
        df_dictionary = pd.DataFrame([row_list])
        df = pd.concat([df, df_dictionary], ignore_index=True)

    return df

def process_semantic_output(results):
    """
    Returns the dataframe with the necessary information, including
    ['query','answer','answer_offset','context_offset','context','content',
    'reader_score','retriever_score','id']. Distinguishes whether it is a single query
    or multiple queries by reading the pipeline output dictionary keys.
    Uses process_query_output to get the dataframe for each query and creates
    one concatenated dataframe. In case of Docs2Answers as the final node, deletes
    the answers part. See the documentation of process_query_output.

    Params
    ---------
    results: raw output of runSemanticPipeline.

    Return
    --------
    df: dataframe with all the columns mentioned in the function description.
    """
    output = {}
    if 'query' in results.keys():
        output['query'] = results['query']
        output['documents'] = results['documents']
        if results['node_id'] == 'Docs2Answers':
            pass
        else:
            output['answers'] = results['answers']
        df = process_query_output(output)
        return df
    if 'queries' in results.keys():
        df = pd.DataFrame(columns=['query','answer','answer_offset',
                                   'context_offset','context','content',
                                   'reader_score','retriever_score','id'])
        for query, answers, documents in zip(results['queries'],
                                results['answers'], results['documents']):
            output = {}
            output['query'] = query
            output['documents'] = documents
            if results['node_id'] == 'Docs2Answers':
                pass
            else:
                output['answers'] = answers

            temp = process_query_output(output)
            df = pd.concat([df, temp], ignore_index=True)

    return df

def semanticsearchAnnotator(matches:List[List[int]], document:Text):
    """
    Annotates the text in the document defined by a list of [start index, end index].
    Example: "How are you today"; if the document type is text, matches = [[0,3]]
    will give answer = "How", however in case we used the spacy matcher then
    matches = [[0,3]] will give answer = "How are you". However if spacy is used
    to find "How" then the matches = [[0,1]] for the string defined above.
    """
    start = 0
    annotated_text = ""
    for match in matches:
        start_idx = match[0]
        end_idx = match[1]
        if check_streamlit():
            annotated_text = (annotated_text + document[start:start_idx]
                              + str(annotation(body=document[start_idx:end_idx],
                              label="Context", background="#964448", color='#ffffff')))
        else:
            annotated_text = (annotated_text + document[start:start_idx]
                              + colored(document[start_idx:end_idx],
                              "green", attrs = ['bold']))
        start = end_idx

    annotated_text = annotated_text + document[end_idx:]

    if check_streamlit():
        st.write(
            markdown(annotated_text),
            unsafe_allow_html=True,
        )
    else:
        print(annotated_text)


def semantic_keywordsearch(query:Text, documents:List[Document],
                    embedding_model:Text,
                    embedding_model_format:Text,
                    embedding_layer:int, reader_model:str,
                    retriever_top_k:int = 10, reader_top_k:int = 10,
                    return_results:bool = False, embedding_dim:int = 768,
                    max_seq_len:int = 512, top_k_per_candidate:int = 1,
                    sort_by:Literal["retriever", "reader"] = 'retriever'):
    """
    Performs the semantic search on the list of Haystack documents which is
    returned by the preprocessing pipeline.

    Params
    -------
    query: Keywords that need to be searched in the documents.
    documents: List of Haystack documents returned by the preprocessing pipeline.
    """
    semanticsearch_pipeline, doc_store = semanticSearchPipeline(documents = documents,
                    embedding_model= embedding_model,
                    embedding_layer= embedding_layer,
                    embedding_model_format= embedding_model_format,
                    reader_model= reader_model, retriever_top_k= retriever_top_k,
                    reader_top_k= reader_top_k, embedding_dim=embedding_dim,
                    max_seq_len=max_seq_len,
                    top_k_per_candidate=top_k_per_candidate)

    raw_output = runSemanticPipeline(semanticsearch_pipeline, query)
    results_df = process_semantic_output(raw_output)
    if sort_by == 'retriever':
        results_df = results_df.sort_values(by=['retriever_score'], ascending=False)
    else:
        results_df = results_df.sort_values(by=['reader_score'], ascending=False)

    if return_results:
        return results_df
    else:
        if check_streamlit:
            st.markdown("##### Top few semantic search results #####")
        else:
            print("Top few semantic search results")
        for i in range(len(results_df)):
            if check_streamlit:
                st.write("Result {}".format(i+1))
            else:
                print("Result {}".format(i+1))
            semanticsearchAnnotator([results_df.loc[i]['context_offset']],
+
results_df.loc[i]['content'] )
|
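
A minimal usage sketch of the semantic search entry point above. It is not part of the commit; the document contents, the retriever/reader model names and the embedding_layer and embedding_model_format values below are illustrative assumptions, not the values fixed in paramconfig.cfg.

# Hypothetical driver script; model names and pipeline parameters are placeholders.
from haystack.schema import Document
from utils.semantic_search import semantic_keywordsearch

docs = [Document(content="The country commits to reduce emissions by 30% by 2030."),
        Document(content="Adaptation measures focus on drought-resilient agriculture.")]

results_df = semantic_keywordsearch(
    query="What are the climate mitigation targets?",
    documents=docs,
    embedding_model="sentence-transformers/msmarco-distilbert-cos-v5",  # assumed retriever
    embedding_model_format="sentence_transformers",                     # assumed format string
    embedding_layer=-1,                                                 # assumed layer
    reader_model="deepset/tinyroberta-squad2",                          # assumed reader
    return_results=True)            # return the dataframe instead of rendering it
print(results_df[['answer', 'retriever_score', 'reader_score']].head())
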
utils/streamlitcheck.py
ADDED
@@ -0,0 +1,42 @@
import logging
try:
    import streamlit as st
except ImportError:
    logging.info("Streamlit not installed")


def check_streamlit():
    """
    Function to check whether python code is run within streamlit

    Returns
    -------
    use_streamlit : boolean
        True if code is run within streamlit, else False
    """
    try:
        from streamlit.scriptrunner.script_run_context import get_script_run_ctx
        if not get_script_run_ctx():
            use_streamlit = False
        else:
            use_streamlit = True
    except ModuleNotFoundError:
        use_streamlit = False
    return use_streamlit

def disable_other_checkboxes(*other_checkboxes_keys):
    for checkbox_key in other_checkboxes_keys:
        st.session_state[checkbox_key] = False

def checkbox_without_preselect(keylist):
    dict_ = {}
    for i, key_val in enumerate(keylist):
        dict_[i] = st.checkbox(key_val, key=key_val,
                               on_change=disable_other_checkboxes,
                               args=tuple(list(filter(lambda x: x != key_val, keylist))),)

    for key, val in dict_.items():
        if val == True:
            return keylist[int(key)]

    return None
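
A short sketch of how these helpers are typically wired into a Streamlit page; the option labels are illustrative only. check_streamlit() lets shared utility code fall back to plain print() output when run outside Streamlit, and checkbox_without_preselect() renders a group of mutually exclusive checkboxes with nothing ticked by default.

# Illustrative page snippet; run with `streamlit run page.py`.
import streamlit as st
from utils.streamlitcheck import check_streamlit, checkbox_without_preselect

if check_streamlit():
    choice = checkbox_without_preselect(['SDG Analysis', 'Keyword Search', 'Coherence'])
    st.write("You picked:", choice)   # stays None until the user ticks one box
else:
    print("Not running inside Streamlit, skipping widget rendering.")
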
utils/uploadAndExample.py
ADDED
@@ -0,0 +1,33 @@
import streamlit as st
import tempfile
import json

def add_upload(choice):
    """
    Provides the user with the choice to either 'Upload Document' or 'Try Example'.
    Based on the user choice, runs the streamlit processes and saves the path and
    name of the 'file' to the streamlit session_state, which can then be fetched later.

    """

    if choice == 'Upload Document':
        uploaded_file = st.sidebar.file_uploader('Upload the File',
                                                 type=['pdf', 'docx', 'txt'])
        if uploaded_file is not None:
            with tempfile.NamedTemporaryFile(mode="wb", delete=False) as temp:
                bytes_data = uploaded_file.getvalue()
                temp.write(bytes_data)
                st.session_state['filename'] = uploaded_file.name
                st.session_state['filepath'] = temp.name


    else:
        # listing the options
        with open('docStore/sample/files.json', 'r') as json_file:
            files = json.load(json_file)

        option = st.sidebar.selectbox('Select the example document',
                                      list(files.keys()))
        file_name = file_path = files[option]
        st.session_state['filename'] = file_name
        st.session_state['filepath'] = file_path
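
A sketch of how add_upload() is typically called from an app sidebar; the radio labels match the two branches handled above, and the session_state keys set here are read by the downstream analysis pages. This wiring is an assumption, not code from the commit.

# Illustrative sidebar wiring; assumes it runs inside a Streamlit app.
import streamlit as st
from utils.uploadAndExample import add_upload

choice = st.sidebar.radio('Select the Document',
                          ('Upload Document', 'Try Example'))
add_upload(choice)

if 'filepath' in st.session_state:
    st.write("Ready to analyse:", st.session_state['filename'])
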
ver0.1 scripts/cleaning.py
ADDED
@@ -0,0 +1,168 @@
import logging
import pandas as pd
import numpy as np
import string
import nltk
import spacy
import en_core_web_sm
import re
import streamlit as st

from haystack.nodes import PreProcessor

'''basic cleaning - suitable for transformer models'''
def basic(s, SDG=False):
    """
    :param s: string to be processed
    :return: processed string: see comments in the source code for more info
    """
    # Text Lowercase
    #s = s.lower()
    # Remove punctuation
    #translator = str.maketrans(' ', ' ', string.punctuation)
    #s = s.translate(translator)
    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)
    if SDG == True:
        s = s.lower()
        translator = str.maketrans(' ', ' ', string.punctuation)
        s = s.translate(translator)
        s = re.sub('\n', ' ', s)
        s = re.sub("\'", " ", s)
        s = re.sub(r'\d+', ' ', s)
        s = re.sub(r'\W+', ' ', s)

    # Remove new line characters
    #s = re.sub('\n', ' ', s)

    # Remove distracting single quotes
    #s = re.sub("\'", " ", s)
    # Remove all remaining numbers and non alphanumeric characters
    #s = re.sub(r'\d+', ' ', s)
    #s = re.sub(r'\W+', ' ', s)

    # define custom words to replace:
    #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)

    return s.strip()


def preprocessingForSDG(document):

    """
    Takes in a haystack document object, splits it into paragraphs and applies simple cleaning.

    Returns a cleaned list of haystack document objects, one paragraph per object. Also returns a pandas df and
    a list that contains all text joined together.
    """

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=120,
        split_respect_sentence_boundary=False,
        #split_overlap=1
    )
    for i in document:
        docs_processed = preprocessor.process([i])
        for item in docs_processed:
            item.content = basic(item.content, SDG=True)

    with st.spinner("👑 document being split into paragraphs"):
        logging.info("document has been split into {} paragraphs".format(len(docs_processed)))

    # create dataframe of text and list of all text
    df = pd.DataFrame(docs_processed)
    all_text = " ".join(df.content.to_list())
    par_list = df.content.to_list()

    return docs_processed, df, all_text, par_list

def preprocessing(document):

    """
    Takes in a haystack document object, splits it into paragraphs and applies simple cleaning.

    Returns a cleaned list of haystack document objects, one paragraph per object. Also returns a pandas df and
    a list that contains all text joined together.
    """

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="sentence",
        split_length=3,
        split_respect_sentence_boundary=False,
        split_overlap=1
    )
    for i in document:
        docs_processed = preprocessor.process([i])
        for item in docs_processed:
            item.content = basic(item.content)

    with st.spinner("👑 document being split into paragraphs"):
        logging.info("document has been split into {} paragraphs".format(len(docs_processed)))

    # create dataframe of text and list of all text
    df = pd.DataFrame(docs_processed)
    all_text = " ".join(df.content.to_list())
    par_list = df.content.to_list()

    return docs_processed, df, all_text, par_list

'''processing with spacy - suitable for models such as tf-idf, word2vec'''
def spacy_clean(alpha:str, use_nlp:bool = True) -> str:

    """
    Clean and tokenise a string using spaCy. Keeps only alphabetic characters,
    removes stopwords and filters out all but proper nouns, nouns, verbs and adjectives.

    Parameters
    ----------
    alpha : str
        The input string.

    use_nlp : bool, default True
        Indicates whether spaCy needs to run the NLP pipeline. Enable this when using
        this function on its own. Should be set to False if used inside nlp.pipeline.

    Returns
    -------
    ' '.join(beta) : a concatenated list of lemmatised tokens, i.e. a processed string

    Notes
    -----
    Fails if alpha is an NA value. Performance decreases as len(alpha) gets large.
    Use together with nlp.pipeline for batch processing.

    """

    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])

    if use_nlp:
        alpha = nlp(alpha)

    beta = []
    for tok in alpha:
        if all([tok.is_alpha, not tok.is_stop, tok.pos_ in ['PROPN', 'NOUN', 'VERB', 'ADJ']]):
            beta.append(tok.lemma_)

    text = ' '.join(beta)
    text = text.lower()
    return text
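
A minimal sketch of the intended call order for this ver0.1 module: load a file with the ver0.1 document loader, then split and clean it with preprocessing(). The sample path follows the 'sample/' convention used by the ver0.1 scripts and is assumed to exist next to them.

# Sketch only; assumes the ver0.1 module layout (udfPreprocess package) and sample path.
import udfPreprocess.docPreprocessing as pre
import udfPreprocess.cleaning as clean

docs = pre.load_document('sample/Ethiopia_s_2021_10 Year Development Plan.txt',
                         'Ethiopia_s_2021_10 Year Development Plan.txt')
haystack_docs, df, all_text, par_list = clean.preprocessing(docs)
print(len(par_list), "paragraphs; first one:", par_list[0][:200])
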
ver0.1 scripts/coherence.py
ADDED
@@ -0,0 +1,267 @@
# set path
import glob, os, sys; sys.path.append('../udfPreprocess')

#import helper
import udfPreprocess.docPreprocessing as pre
import udfPreprocess.cleaning as clean

#import needed libraries
import seaborn as sns
from pandas import DataFrame
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from sklearn.metrics.pairwise import cosine_similarity
# from keybert import KeyBERT
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import urllib.request
import ast
import tempfile
import sqlite3
import json
import docx
from docx.shared import Inches
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE

def app():
    # Sidebar
    st.sidebar.title('Check Coherence')
    st.sidebar.write(' ')
    with open('ndcs/countryList.txt') as dfile:
        countryList = dfile.read()

    countryList = ast.literal_eval(countryList)
    countrynames = list(countryList.keys())

    option = st.sidebar.selectbox('Select Country', (countrynames))
    countryCode = countryList[option]

    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> Check Coherence of Policy Document with NDCs</h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

        with st.expander("ℹ️ - About this app", expanded=True):
            st.write(
                """
                The *Check Coherence* app is an easy-to-use interface built in Streamlit for analysing a policy document and checking its coherence with the country's NDCs/new or updated NDCs - developed by GIZ Data and the Sustainable Development Solution Network.
                """
            )

            st.markdown("")

    st.markdown("")
    st.markdown("## 📌 Step One: Upload document of the country selected ")

    with st.container():
        docs = None
        # asking user for either upload or select existing doc
        choice = st.radio(label = 'Select the Document',
                          help = 'You can upload the document \
                          or else you can try an example document.',
                          options = ('Upload Document', 'Try Example'),
                          horizontal = True)

        if choice == 'Upload Document':
            uploaded_file = st.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
            if uploaded_file is not None:
                with tempfile.NamedTemporaryFile(mode="wb") as temp:
                    bytes_data = uploaded_file.getvalue()
                    temp.write(bytes_data)

                    st.write("Uploaded Filename: ", uploaded_file.name)
                    file_name = uploaded_file.name
                    file_path = temp.name
                    docs = pre.load_document(file_path, file_name)
                    haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)

        else:
            # listing the options
            option = st.selectbox('Select the example document',
                                  ('South Africa:Low Emission strategy',
                                   'Ethiopia: 10 Year Development Plan'))
            if option == 'South Africa:Low Emission strategy':
                file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
                countryCode = countryList['South Africa']
                st.write("Selected document:", file_name.split('/')[1])
                # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
                # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
            else:
                # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
                file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
                countryCode = countryList['Ethiopia']
                st.write("Selected document:", file_name.split('/')[1])

            if option is not None:
                docs = pre.load_document(file_path, file_name)
                haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)

        with open('ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
            cca_sent = dfile.read()

        cca_sent = ast.literal_eval(cca_sent)

        with open('ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
            ccm_sent = dfile.read()

        ccm_sent = ast.literal_eval(ccm_sent)

        with open('ndcs/countryList.txt') as dfile:
            countryList = dfile.read()

        countryList = ast.literal_eval(countryList)

        def get_document(countryCode: str):
            link = "https://klimalog.die-gdi.de/ndc/open-data/dataset.json"
            with urllib.request.urlopen(link) as urlfile:
                data = json.loads(urlfile.read())
            categoriesData = {}
            categoriesData['categories'] = data['categories']
            categoriesData['subcategories'] = data['subcategories']
            keys_sub = categoriesData['subcategories'].keys()
            documentType = 'NDCs'
            if documentType in data.keys():
                if countryCode in data[documentType].keys():
                    get_dict = {}
                    for key, value in data[documentType][countryCode].items():
                        if key not in ['country_name', 'region_id', 'region_name']:
                            get_dict[key] = value['classification']
                        else:
                            get_dict[key] = value
                else:
                    return None
            else:
                return None

            country = {}
            for key in categoriesData['categories']:
                country[key] = {}
            for key, value in categoriesData['subcategories'].items():
                country[value['category']][key] = get_dict[key]

            return country

        # country_ndc = get_document('NDCs', countryList[option])

        def countrySpecificCCA(cca_sent, threshold, countryCode):
            temp = {}
            doc = get_document(countryCode)
            for key, value in cca_sent.items():
                id_ = doc['climate change adaptation'][key]['id']
                if id_ > threshold:
                    temp[key] = value['id'][id_]
            return temp


        def countrySpecificCCM(ccm_sent, threshold, countryCode):
            temp = {}
            doc = get_document(countryCode)
            for key, value in ccm_sent.items():
                id_ = doc['climate change mitigation'][key]['id']
                if id_ > threshold:
                    temp[key] = value['id'][id_]

            return temp


        if docs is not None:
            sent_cca = countrySpecificCCA(cca_sent, 1, countryCode)
            sent_ccm = countrySpecificCCM(ccm_sent, 1, countryCode)
            #st.write(sent_ccm)
            @st.cache(allow_output_mutation=True)
            def load_sentenceTransformer(name):
                return SentenceTransformer(name)
            model = load_sentenceTransformer('all-MiniLM-L6-v2')

            document_embeddings = model.encode(paraList, show_progress_bar=True)

            genre = st.radio("Select Category", ('Climate Change Adaptation', 'Climate Change Mitigation'))
            if genre == 'Climate Change Adaptation':
                sent_dict = sent_cca
                sent_labels = []
                for key, sent in sent_dict.items():
                    sent_labels.append(sent)
                label_embeddings = model.encode(sent_labels, show_progress_bar=True)
                similarity_high_threshold = 0.55
                similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
                label_indices, paragraph_indices = np.where(similarity_matrix > similarity_high_threshold)

                positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))

            else:
                sent_dict = sent_ccm
                sent_labels = []
                for key, sent in sent_dict.items():
                    sent_labels.append(sent)
                label_embeddings = model.encode(sent_labels, show_progress_bar=True)
                similarity_high_threshold = 0.55
                similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
                label_indices, paragraph_indices = np.where(similarity_matrix > similarity_high_threshold)

                positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))

            # sent_labels = []
            # for key,sent in sent_dict.items():
            #     sent_labels.append(sent)

            # label_embeddings = model.encode(sent_labels, show_progress_bar=True)

            #similarity_high_threshold = 0.55
            # similarity_matrix = cosine_similarity(label_embeddings, document_embeddings)
            #label_indices, paragraph_indices = np.where(similarity_matrix>similarity_high_threshold)

            #positive_indices = list(zip(label_indices.tolist(), paragraph_indices.tolist()))
            document = docx.Document()
            document.add_heading('Document name:{}'.format(file_name), 2)
            section = document.sections[0]

            # Calling the footer
            footer = section.footer

            # Calling the paragraph already present in
            # the footer section
            footer_para = footer.paragraphs[0]

            font_styles = document.styles
            font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
            font_object = font_charstyle.font
            font_object.size = Pt(7)
            # Adding the centered zoned footer
            footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')

            document.add_paragraph("Country code for which the NDC comparison is carried out: {}".format(countryCode))

            for _label_idx, _paragraph_idx in positive_indices:
                st.write("This paragraph: \n")
                document.add_paragraph("This paragraph: \n")
                st.write(paraList[_paragraph_idx])
                st.write(f"Is relevant to: \n {list(sent_dict.keys())[_label_idx]}")
                document.add_paragraph(f"Is relevant to: \n {list(sent_dict.keys())[_label_idx]}")
                st.write('-'*10)
                document.add_paragraph('-'*10)

            document.save('demo.docx')
            with open("demo.docx", "rb") as file:
                btn = st.download_button(
                    label="Download file",
                    data=file,
                    file_name="demo.docx",
                    mime="txt/docx"
                )
ver0.1 scripts/docPreprocessing.py
ADDED
@@ -0,0 +1,75 @@
from typing import Callable, Dict, List, Optional

from pathlib import Path
import re
import logging
import string
import streamlit as st
logger = logging.getLogger(__name__)

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from haystack.utils import convert_files_to_docs, fetch_archive_from_http
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter
from haystack.nodes.file_converter import PDFToTextConverter, TextConverter
from haystack.schema import Document
import pdfplumber

import pandas as pd

import tempfile
import sqlite3


def load_document(
    file_path: str,
    file_name,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:

    """
    Takes docx, txt and pdf files as input and extracts the text as well as the
    filename as metadata. Since haystack does not take care of all pdf files,
    pdfplumber is attached to the pipeline in case the pdf extraction fails
    via Haystack.

    Returns a list of type haystack.schema.Document
    """

    if file_name.endswith('.pdf'):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    if file_name.endswith('.txt'):
        converter = TextConverter()
    if file_name.endswith('.docx'):
        converter = DocxToTextConverter()


    documents = []
    logger.info("Converting {}".format(file_name))
    # PDFToTextConverter, TextConverter, and DocxToTextConverter
    # return a list containing a single Document
    document = converter.convert(
        file_path=file_path, meta=None,
        encoding=encoding, id_hash_keys=id_hash_keys
    )[0]
    text = document.content
    documents.append(Document(content=text,
                              meta={"name": file_name},
                              id_hash_keys=id_hash_keys))

    '''check if text is empty and apply a different pdf processor. \
    This can happen with certain pdf types.'''
    for i in documents:
        if i.content == "":
            with st.spinner("using pdfplumber"):
                text = []
                with pdfplumber.open(file_path) as pdf:
                    for page in pdf.pages:
                        text.append(page.extract_text())
                i.content = ' '.join([page for page in text])

    return documents
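
A small usage sketch for the loader above; the file path is a placeholder, any local pdf/docx/txt works. The pdfplumber fallback only kicks in when Haystack's converter returns empty content.

# Sketch; 'docStore/sample/example_policy.pdf' is a hypothetical path.
from udfPreprocess.docPreprocessing import load_document

docs = load_document(file_path='docStore/sample/example_policy.pdf',
                     file_name='example_policy.pdf')
print(docs[0].meta['name'], len(docs[0].content), "characters extracted")
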
ver0.1 scripts/keyword_search.py
ADDED
@@ -0,0 +1,169 @@
# set path
import glob, os, sys
from udfPreprocess.search import semantic_search
sys.path.append('../udfPreprocess')

#import helper
import udfPreprocess.docPreprocessing as pre
import udfPreprocess.cleaning as clean
from udfPreprocess.search import bm25_tokenizer, bm25TokenizeDoc, lexical_search
#import needed libraries
import seaborn as sns
from pandas import DataFrame
from sentence_transformers import SentenceTransformer, CrossEncoder, util
# from keybert import KeyBERT
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import docx
from docx.shared import Inches
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE
import logging
logger = logging.getLogger(__name__)
import tempfile
import sqlite3
import json
import configparser


def app():

    with st.container():
        st.markdown("<h1 style='text-align: center; \
                    color: black;'> Search</h1>",
                    unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):
        st.write(
            """
            The *Keyword Search* app is an easy-to-use interface \
            built in Streamlit for doing keyword search in a \
            policy document - developed by GIZ Data and the \
            Sustainable Development Solution Network.
            """)

        st.markdown("")


    with st.sidebar:
        with open('sample/keywordexample.json', 'r') as json_file:
            keywordexample = json.load(json_file)

        genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
        if genre == 'Food':
            keywordList = keywordexample['Food']
        elif genre == 'Climate':
            keywordList = keywordexample['Climate']
        elif genre == 'Social':
            keywordList = keywordexample['Social']
        elif genre == 'Nature':
            keywordList = keywordexample['Nature']
        elif genre == 'Implementation':
            keywordList = keywordexample['Implementation']
        else:
            keywordList = None

        searchtype = st.selectbox("Do you want to find exact matches or similar meaning/context", ['Exact Matches', 'Similar context/meaning'])


    with st.container():
        if keywordList is not None:
            queryList = st.text_input("You selected the {} category, we will look for these keywords in the document".format(genre),
                                      value="{}".format(keywordList))
        else:
            queryList = st.text_input("Please enter here your question and we will look \
                                      for an answer in the document OR enter the keyword you \
                                      are looking for and we will look for similar context \
                                      in the document.",
                                      placeholder="Enter keyword here")

        if st.button("Find them"):

            if queryList == "":
                st.info("🤔 No keyword provided, if you dont have any, please try example sets from sidebar!")
                logging.warning("Terminated as no keyword provided")
            else:

                if 'docs' in st.session_state:
                    docs = st.session_state['docs']
                    paraList = st.session_state['paraList']

                    if searchtype == 'Exact Matches':
                        queryList = list(queryList.split(","))
                        logging.info("performing lexical search")
                        tokenized_corpus = bm25TokenizeDoc(paraList)
                        # st.write(len(tokenized_corpus))
                        document_bm25 = BM25Okapi(tokenized_corpus)

                        with st.spinner("Performing Exact matching search (Lexical search) for you"):
                            st.markdown("##### Top few lexical search (BM25) hits #####")

                            for keyword in queryList:

                                bm25_hits = lexical_search(keyword, document_bm25)

                                counter = 0
                                for hit in bm25_hits:
                                    if hit['score'] > 0.00:
                                        counter += 1
                                        if counter == 1:
                                            st.markdown("###### Results for keyword: **{}** ######".format(keyword))
                                        # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
                                        st.write("\t {}: {}\t".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))

                                if counter == 0:
                                    st.write("No results found for '**{}**' ".format(keyword))

                                st.markdown("---")
                    else:
                        logging.info("starting semantic search")
                        with st.spinner("Performing Similar/Contextual search"):
                            query = "Find {} related issues ?".format(queryList)
                            config = configparser.ConfigParser()
                            config.read_file(open('udfPreprocess/paramconfig.cfg'))
                            threshold = float(config.get('semantic_search', 'THRESHOLD'))
                            # st.write(query)
                            semantic_hits = semantic_search(query, paraList)
                            st.markdown("##### Few Semantic search hits for {} related topics #####".format(queryList))

                            for i, queryhit in enumerate(semantic_hits):

                                # st.markdown("###### Results for query: **{}** ######".format(queryList[i]))
                                counter = 0
                                for hit in queryhit:
                                    counter += 1

                                    if hit['score'] > threshold:
                                        # st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
                                        st.write("\t {}: \t {}".format(counter, paraList[hit['corpus_id']].replace("\n", " ")))

                                    # document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
                                st.markdown("---")
                                # st.write(semantic_hits)


                else:
                    st.info("🤔 No document found, please try to upload it at the sidebar!")
                    logging.warning("Terminated as no document provided")
ver0.1 scripts/sdg.py
ADDED
@@ -0,0 +1,57 @@
import glob, os, sys;
sys.path.append('../udfPreprocess')

#import helper
import udfPreprocess.docPreprocessing as pre
import udfPreprocess.cleaning as clean

#import needed libraries
import seaborn as sns
from pandas import DataFrame
from keybert import KeyBERT
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd
import docx
from docx.shared import Inches
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE

import tempfile
import sqlite3
import logging
logger = logging.getLogger(__name__)
import configparser

@st.cache(allow_output_mutation=True)
def load_sdgClassifier():
    classifier = pipeline("text-classification", model="jonas/sdg_classifier_osdg")
    logging.info("Loading classifier")
    return classifier

def sdg_classification(par_list):
    logging.info("running SDG classification")
    config = configparser.ConfigParser()
    config.read_file(open('udfPreprocess/paramconfig.cfg'))
    threshold = float(config.get('sdg', 'THRESHOLD'))


    classifier = load_sdgClassifier()
    labels = classifier(par_list)

    labels_ = [(l['label'], l['score']) for l in labels]
    # df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
    df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])

    df2['text'] = par_list
    df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
    df2.index += 1
    df2 = df2[df2['Relevancy'] > threshold]
    x = df2['SDG'].value_counts()
    df3 = df2.copy()
    df3 = df3.drop(['Relevancy'], axis=1)


    return df3, x
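
A sketch of calling the classification helper directly on a few paragraphs. It assumes the ver0.1 layout where this function is importable from the udfPreprocess package (sdg_analysis.py imports it as udfPreprocess.sdg_classifier) and that udfPreprocess/paramconfig.cfg provides an [sdg] THRESHOLD entry; the paragraphs are illustrative.

# Sketch; import path and paragraphs are assumptions, threshold comes from paramconfig.cfg.
from udfPreprocess.sdg_classifier import sdg_classification

paragraphs = ["Expand access to affordable and clean energy in rural areas.",
              "Improve maternal health services and reduce child mortality."]
df_sdg, counts = sdg_classification(paragraphs)
print(counts)            # number of paragraphs per predicted SDG above the threshold
print(df_sdg.head())     # SDG label and text, relevancy column dropped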
ver0.1 scripts/sdg_analysis.py
ADDED
@@ -0,0 +1,160 @@
# set path
import glob, os, sys;
sys.path.append('../udfPreprocess')

#import helper


#import needed libraries
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import docx
from docx.shared import Inches
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE
from udfPreprocess.sdg_classifier import sdg_classification
from udfPreprocess.sdg_classifier import runSDGPreprocessingPipeline
import configparser
import tempfile
import sqlite3
import logging
logger = logging.getLogger(__name__)


def app():

    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> SDSN x GIZ Policy Action Tracking v0.1</h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):
        st.write(
            """
            The *Analyse Policy Document* app is an easy-to-use interface built in Streamlit for analyzing policy documents with respect to SDG Classification for the paragraphs/texts in the document - developed by GIZ Data and the Sustainable Development Solution Network. \n
            """)
        st.markdown("")


    with st.container():

        if 'filepath' in st.session_state:
            paraList = runSDGPreprocessingPipeline()
            with st.spinner("Running SDG"):

                df, x = sdg_classification(paraList)


            # classifier = load_sdgClassifier()

            # labels = classifier(par_list)
            # labels_= [(l['label'],l['score']) for l in labels]
            # df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
            # df2['text'] = par_list
            # df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
            # df2.index += 1
            # df2 =df2[df2['Relevancy']>.85]
            # x = df2['SDG'].value_counts()
            # df3 = df2.copy()

            plt.rcParams['font.size'] = 25
            colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
            # plot
            fig, ax = plt.subplots()
            ax.pie(x, colors=colors, radius=2, center=(4, 4),
                   wedgeprops={"linewidth": 1, "edgecolor": "white"}, frame=False, labels=list(x.index))
            # fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
            st.markdown("#### Anything related to SDGs? ####")

            # st.markdown("#### 🎈 Anything related to SDGs? ####")

            c4, c5, c6 = st.columns([2, 2, 2])

            # Add styling
            cmGreen = sns.light_palette("green", as_cmap=True)
            cmRed = sns.light_palette("red", as_cmap=True)
            # df2 = df2.style.background_gradient(
            #     cmap=cmGreen,
            #     subset=[
            #         "Relevancy",
            #     ],
            # )

            # format_dictionary = {
            #     "Relevancy": "{:.1%}",
            # }

            # df2 = df2.format(format_dictionary)

            with c5:
                st.pyplot(fig)

            c7, c8, c9 = st.columns([1, 10, 1])
            with c8:
                st.table(df)


    # 1. Keyword heatmap \n
    # 2. SDG Classification for the paragraphs/texts in the document
    #

    # with st.container():
    #     if 'docs' in st.session_state:
    #         docs = st.session_state['docs']
    #         docs_processed, df, all_text, par_list = clean.preprocessingForSDG(docs)
    #         # paraList = st.session_state['paraList']
    #         logging.info("keybert")
    #         with st.spinner("Running Key bert"):

    #             kw_model = load_keyBert()

    #             keywords = kw_model.extract_keywords(
    #                 all_text,
    #                 keyphrase_ngram_range=(1, 3),
    #                 use_mmr=True,
    #                 stop_words="english",
    #                 top_n=10,
    #                 diversity=0.7,
    #             )

    #             st.markdown("## 🎈 What is my document about?")

    #             df = (
    #                 DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
    #                 .sort_values(by="Relevancy", ascending=False)
    #                 .reset_index(drop=True)
    #             )
    #             df1 = (
    #                 DataFrame(keywords, columns=["Keyword/Keyphrase", "Relevancy"])
    #                 .sort_values(by="Relevancy", ascending=False)
    #                 .reset_index(drop=True)
    #             )
    #             df.index += 1

    #             # Add styling
    #             cmGreen = sns.light_palette("green", as_cmap=True)
    #             cmRed = sns.light_palette("red", as_cmap=True)
    #             df = df.style.background_gradient(
    #                 cmap=cmGreen,
    #                 subset=[
    #                     "Relevancy",
    #                 ],
    #             )

    #             c1, c2, c3 = st.columns([1, 3, 1])

    #             format_dictionary = {
    #                 "Relevancy": "{:.1%}",
    #             }

    #             df = df.format(format_dictionary)

    #             with c2:
    #                 st.table(df)
ver0.1 scripts/search.py
ADDED
@@ -0,0 +1,141 @@
import glob, os, sys; sys.path.append('../utils')

#import needed libraries
import seaborn as sns
from pandas import DataFrame
from sentence_transformers import SentenceTransformer, CrossEncoder, util
# from keybert import KeyBERT
from transformers import pipeline
import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm
import docx
from docx.shared import Inches
from docx.shared import Pt
from docx.enum.style import WD_STYLE_TYPE
import logging
logger = logging.getLogger(__name__)
import tempfile
import sqlite3
import configparser

### These are lexical search related functions #####

def bm25_tokenizer(text):
    tokenized_doc = []
    for token in text.lower().split():
        token = token.strip(string.punctuation)

        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
            tokenized_doc.append(token)
    return tokenized_doc

def bm25TokenizeDoc(paraList):
    tokenized_corpus = []
    ########## Commenting this for now ########### will incorporate paragraph splitting later.
    # for passage in tqdm(paraList):
    #     if len(passage.split()) > 256:
    #         # st.write("Splitting")
    #         temp = " ".join(passage.split()[:256])
    #         tokenized_corpus.append(bm25_tokenizer(temp))
    #         temp = " ".join(passage.split()[256:])
    #         tokenized_corpus.append(bm25_tokenizer(temp))
    #     else:
    #         tokenized_corpus.append(bm25_tokenizer(passage))
    ##############################################################################
    for passage in tqdm(paraList):
        tokenized_corpus.append(bm25_tokenizer(passage))

    return tokenized_corpus

def lexical_search(keyword, document_bm25):
    config = configparser.ConfigParser()
    config.read_file(open('udfPreprocess/paramconfig.cfg'))
    top_k = int(config.get('lexical_search', 'TOP_K'))
    bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
    top_n = np.argpartition(bm25_scores, -top_k)[-top_k:]
    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
    return bm25_hits

@st.cache(allow_output_mutation=True)
def load_sentenceTransformer(name):
    return SentenceTransformer(name)


def semantic_search(keywordlist, paraList):

    ##### Semantic Search #####
    #query = "Does document contain {} issues ?".format(keyword)
    config = configparser.ConfigParser()
    config.read_file(open('udfPreprocess/paramconfig.cfg'))
    model_name = config.get('semantic_search', 'MODEL_NAME')

    bi_encoder = load_sentenceTransformer(model_name)
    bi_encoder.max_seq_length = int(config.get('semantic_search', 'MAX_SEQ_LENGTH'))    # Truncate long passages
    top_k = int(config.get('semantic_search', 'TOP_K'))
    document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
    question_embedding = bi_encoder.encode(keywordlist, convert_to_tensor=True)

    hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)

    return hits

def show_results(keywordList):
    document = docx.Document()
    # document.add_heading('Document name:{}'.format(file_name), 2)
    section = document.sections[0]

    # Calling the footer
    footer = section.footer

    # Calling the paragraph already present in
    # the footer section
    footer_para = footer.paragraphs[0]

    font_styles = document.styles
    font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
    font_object = font_charstyle.font
    font_object.size = Pt(7)
    # Adding the centered zoned footer
    footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
    document.add_heading('You searched for {}'.format(keywordList), level=1)
    for keyword in keywordList:

        st.write("Results for Query: {}".format(keyword))
        para = document.add_paragraph().add_run("Results for Query: {}".format(keyword))
        para.font.size = Pt(12)
        bm25_hits, hits = search(keyword)

        st.markdown("""
            We provide two kinds of results: the 'lexical search' and the semantic search.
            """)
        # In the semantic search part we provide two kinds of results, one with only the Retriever (Bi-Encoder) and the other with the ReRanker (Cross Encoder)
        st.markdown("Top few lexical search (BM25) hits")
        document.add_paragraph("Top few lexical search (BM25) hits")

        for hit in bm25_hits[0:5]:
            if hit['score'] > 0.00:
                st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
                document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))


        # st.table(bm25_hits[0:3])

        st.markdown("\n-------------------------\n")
        st.markdown("Top few Bi-Encoder Retrieval hits")
        document.add_paragraph("\n-------------------------\n")
        document.add_paragraph("Top few Bi-Encoder Retrieval hits")

        hits = sorted(hits, key=lambda x: x['score'], reverse=True)
        for hit in hits[0:5]:
            # if hit['score'] > 0.45:
            st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
            document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
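
A compact sketch of the BM25 path above: tokenize the paragraphs once, build the BM25Okapi index, then score a keyword with lexical_search(). It assumes udfPreprocess/paramconfig.cfg provides [lexical_search] TOP_K and that the corpus has at least TOP_K paragraphs; the paragraphs below are illustrative.

# Sketch; config path and sample paragraphs are assumptions.
from rank_bm25 import BM25Okapi
from udfPreprocess.search import bm25TokenizeDoc, lexical_search

para_list = ["Renewable energy targets are set for 2030.",
             "The plan strengthens flood protection infrastructure.",
             "Education spending will rise by two percent of GDP."]
document_bm25 = BM25Okapi(bm25TokenizeDoc(para_list))
for hit in lexical_search("renewable energy", document_bm25):
    print(round(hit['score'], 3), para_list[hit['corpus_id']])
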
ver0.1 scripts/uploadAndExample.py
ADDED
@@ -0,0 +1,52 @@
import streamlit as st
import tempfile
import udfPreprocess.docPreprocessing as pre
import udfPreprocess.cleaning as clean

def add_upload(choice):


    if choice == 'Upload Document':
        uploaded_file = st.sidebar.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
        if uploaded_file is not None:
            with tempfile.NamedTemporaryFile(mode="wb") as temp:
                bytes_data = uploaded_file.getvalue()
                temp.write(bytes_data)
                st.session_state['filename'] = uploaded_file.name
                # st.write("Uploaded Filename: ", uploaded_file.name)
                file_name = uploaded_file.name
                file_path = temp.name
                # docs = pre.load_document(file_path, file_name)
                # haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
                st.session_state['filename'] = file_name
                # st.session_state['paraList'] = paraList
                st.session_state['filepath'] = file_path


    else:
        # listing the options
        option = st.sidebar.selectbox('Select the example document',
                                      ('South Africa:Low Emission strategy',
                                       'Ethiopia: 10 Year Development Plan'))
        if option == 'South Africa:Low Emission strategy':
            file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
            st.session_state['filename'] = file_name
            st.session_state['filepath'] = file_path
            # st.write("Selected document:", file_name.split('/')[1])
            # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
            # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
        else:
            # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
            file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
            st.session_state['filename'] = file_name
            st.session_state['filepath'] = file_path
            # st.write("Selected document:", file_name.split('/')[1])

        # if option is not None:
        #     docs = pre.load_document(file_path,file_name)
        #     haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
        #     st.session_state['docs'] = docs
        #     st.session_state['paraList'] = paraList