Delete appStore
- appStore/__init__.py +0 -1
- appStore/coherence.py +0 -156
- appStore/info.py +0 -72
- appStore/keyword_search.py +0 -176
- appStore/multiapp.py +0 -70
- appStore/sdg_analysis.py +0 -179
appStore/__init__.py
DELETED
@@ -1 +0,0 @@
# creating appstore package
appStore/coherence.py
DELETED
@@ -1,156 +0,0 @@
# set path
import glob, os, sys
sys.path.append('../utils')

import streamlit as st
import ast
import logging
from utils.ndc_explorer import countrySpecificCCA, countrySpecificCCM
from utils.checkconfig import getconfig
from utils.semantic_search import runSemanticPreprocessingPipeline, process_semantic_output
from utils.semantic_search import semanticSearchPipeline, runSemanticPipeline
from st_aggrid import AgGrid
from st_aggrid.shared import ColumnsAutoSizeMode

# Reading data and declaring necessary variables
with open('docStore/ndcs/countryList.txt') as dfile:
    countryList = dfile.read()
countryList = ast.literal_eval(countryList)
countrynames = list(countryList.keys())

with open('docStore/ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
    cca_sent = dfile.read()
cca_sent = ast.literal_eval(cca_sent)

with open('docStore/ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
    ccm_sent = dfile.read()
ccm_sent = ast.literal_eval(ccm_sent)

config = getconfig('paramconfig.cfg')
split_by = config.get('coherence', 'SPLIT_BY')
split_length = int(config.get('coherence', 'SPLIT_LENGTH'))
split_overlap = int(config.get('coherence', 'SPLIT_OVERLAP'))
split_respect_sentence_boundary = bool(int(config.get('coherence',
                                        'RESPECT_SENTENCE_BOUNDARY')))
remove_punc = bool(int(config.get('coherence', 'REMOVE_PUNC')))
embedding_model = config.get('coherence', 'RETRIEVER')
embedding_model_format = config.get('coherence', 'RETRIEVER_FORMAT')
embedding_layer = int(config.get('coherence', 'RETRIEVER_EMB_LAYER'))
embedding_dim = int(config.get('coherence', 'EMBEDDING_DIM'))
max_seq_len = int(config.get('coherence', 'MAX_SEQ_LENGTH'))
retriever_top_k = int(config.get('coherence', 'RETRIEVER_TOP_K'))


def app():

    #### APP INFO #####
    with st.container():
        st.markdown("<h1 style='text-align: center; \
                      color: black;'> NDC Comparison</h1>",
                    unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):

        st.write(
            """
            The *NDC Comparison* application provides easy evaluation of \
            coherence between a given policy document and a country’s (Intended) \
            Nationally Determined Contribution (INDCs/NDCs) using open-source \
            data from the German Institute of Development and Sustainability’s \
            (IDOS) [NDC Explorer](https://klimalog.idos-research.de/ndc/#NDCExplorer/worldMap?NewAndUpdatedNDC??income???catIncome).\
            """)
        st.write("")
        st.write(""" The user can select a country context via the drop-down menu \
            on the left-hand side of the application. Subsequently, the user is \
            given the opportunity to manually upload another policy document \
            from the same national context or to select a pre-loaded example \
            document. Thereafter, the user can choose between two categories \
            to compare coherence between the documents: climate change adaptation \
            and climate change mitigation. Based on the selected information, \
            the application identifies relevant paragraphs in the uploaded \
            document and assigns them to the respective indicator from the NDC \
            Explorer. Currently, the NDC Explorer has 20 indicators under \
            climate change mitigation (e.g., fossil fuel production, REDD+) and \
            22 indicators under climate change adaptation (e.g., sea level rise, \
            investment needs). The assignment of a paragraph to a corresponding \
            indicator is based on vector similarities, in which the top 3 results, \
            if found, are shown to the user. """)
        st.write("")
        st.write("")
        st.markdown("Some runtime metrics tested with cpu: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
        col1, col2 = st.columns(2)
        with col1:
            st.caption("OCR File processing")
            # st.markdown('<div style="text-align: center;">50 sec</div>', unsafe_allow_html=True)
            st.write("50 sec")

        with col2:
            st.caption("NDC comparison on 200 paragraphs (~35 pages)")
            # st.markdown('<div style="text-align: center;">12 sec</div>', unsafe_allow_html=True)
            st.write("140 sec")

    with st.sidebar:

        option = st.selectbox('Select Country', (countrynames))
        countryCode = countryList[option]
        st.markdown("---")

        genre = st.radio("Select Category", ('Climate Change Adaptation',
                                             'Climate Change Mitigation'))
        st.markdown("---")

    with st.container():
        if st.button("Compare with NDC"):
            sent_cca = countrySpecificCCA(cca_sent, 1, countryCode)
            sent_ccm = countrySpecificCCM(ccm_sent, 1, countryCode)

            if 'filepath' in st.session_state:
                allDocuments = runSemanticPreprocessingPipeline(
                    file_path=st.session_state['filepath'],
                    file_name=st.session_state['filename'],
                    split_by=split_by,
                    split_length=split_length,
                    split_overlap=split_overlap,
                    remove_punc=remove_punc,
                    split_respect_sentence_boundary=split_respect_sentence_boundary)
                # genre = st.radio("Select Category", ('Climate Change Adaptation', 'Climate Change Mitigation'))
                if genre == 'Climate Change Adaptation':
                    sent_dict = sent_cca
                else:
                    sent_dict = sent_ccm
                sent_labels = []
                for key, sent in sent_dict.items():
                    sent_labels.append(sent)
                if len(allDocuments['documents']) > 100:
                    warning_msg = ": This might take some time, please sit back and relax."
                else:
                    warning_msg = ""
                logging.info("starting Coherence analysis, \
                              country selected {}".format(option))
                with st.spinner("Performing Coherence Analysis for {} \
                                 under {} category{}".format(option, genre, warning_msg)):
                    semanticsearch_pipeline, doc_store = semanticSearchPipeline(
                        documents=allDocuments['documents'],
                        embedding_model=embedding_model,
                        embedding_layer=embedding_layer,
                        embedding_model_format=embedding_model_format,
                        retriever_top_k=retriever_top_k,
                        embedding_dim=embedding_dim,
                        max_seq_len=max_seq_len, useQueryCheck=False)
                    raw_output = runSemanticPipeline(pipeline=semanticsearch_pipeline,
                                                     queries=sent_labels)
                    results_df = process_semantic_output(raw_output)
                    results_df = results_df.drop(['answer', 'answer_offset',
                                                  'context_offset', 'context',
                                                  'reader_score', 'id'],
                                                 axis=1)

                    for i, key in enumerate(list(sent_dict.keys())):
                        st.subheader("Relevant paragraphs for topic: {}".format(key))
                        df = results_df[results_df['query'] == sent_dict[key]].reset_index(drop=True)
                        for j in range(3):
                            st.write('Result {}.'.format(j + 1))
                            st.write(df.loc[j]['content'] + '\n')

            else:
                st.info("🤔 No document found, please try to upload it at the sidebar!")
                logging.warning("Terminated as no document provided")
appStore/info.py
DELETED
@@ -1,72 +0,0 @@
import streamlit as st

def app():


    with open('style.css') as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

    st.markdown("<h2 style='text-align: center; \
                  color: black;'> Policy Action Tracker</h2>",
                unsafe_allow_html=True)


    st.markdown("<div style='text-align: center; \
                  color: grey;'>The Policy Action Tracker is an open-source \
                  digital tool which aims to assist policy analysts and \
                  other users in extracting and filtering relevant \
                  information from policy documents.</div>",
                unsafe_allow_html=True)
    footer = """
        <div class="footer-custom">
            Guidance & Feedback - <a href="https://www.linkedin.com/in/maren-bernlöhr-149891222" target="_blank">Maren Bernlöhr</a> |
            <a href="https://www.linkedin.com/in/manuelkuhm" target="_blank">Manuel Kuhm</a> |
            Developer - <a href="https://www.linkedin.com/in/erik-lehmann-giz/" target="_blank">Erik Lehmann</a> |
            <a href="https://www.linkedin.com/in/jonas-nothnagel-bb42b114b/" target="_blank">Jonas Nothnagel</a> |
            <a href="https://www.linkedin.com/in/prashantpsingh/" target="_blank">Prashant Singh</a> |
        </div>
    """
    st.markdown(footer, unsafe_allow_html=True)

    c1, c2, c3 = st.columns([8, 1, 12])
    with c1:
        st.image("docStore/img/ndc.png")
    with c3:
        st.markdown('<div style="text-align: justify;">The manual extraction \
            of relevant information from text documents is a \
            time-consuming task for any policy analyst. As the amount and length of \
            public policy documents in relation to sustainable development (such as \
            National Development Plans and Nationally Determined Contributions) \
            continuously increases, a major challenge for policy action tracking – the \
            evaluation of stated goals and targets and their actual implementation on \
            the ground – arises. Luckily, Artificial Intelligence (AI) and Natural \
            Language Processing (NLP) methods can help in shortening and easing this \
            task for policy analysts.</div><br>',
            unsafe_allow_html=True)

    intro = """
    <div style="text-align: justify;">

    For this purpose, the United Nations Sustainable Development Solutions \
    Network (SDSN) and the Deutsche Gesellschaft für Internationale \
    Zusammenarbeit (GIZ) GmbH have collaborated in the development \
    of this AI-powered open-source web application that helps find and extract \
    relevant information from public policy documents faster to facilitate \
    evidence-based decision-making processes in sustainable development and beyond.

    This tool gives policy analysts and other users the ability to rapidly \
    search for relevant information/paragraphs in the document according to the \
    user’s interest, classify the document’s content according to the Sustainable \
    Development Goals (SDGs), and compare climate-related policy documents and NDCs \
    across countries using open data from the German Institute of Development and \
    Sustainability’s (IDOS) NDC Explorer.
    To understand the application's functionalities and learn more about \
    the project, see the attached concept note. We hope you like our application 😊

    </div>
    <br>
    """
    st.markdown(intro, unsafe_allow_html=True)
    # st.image("docStore/img/paris.png")
appStore/keyword_search.py
DELETED
@@ -1,176 +0,0 @@
# set path
import glob, os, sys
sys.path.append('../utils')

import streamlit as st
import json
import logging
from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search
from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_keywordsearch
from utils.checkconfig import getconfig
from utils.streamlitcheck import checkbox_without_preselect

# Declare all the necessary variables
config = getconfig('paramconfig.cfg')
split_by = config.get('semantic_search', 'SPLIT_BY')
split_length = int(config.get('semantic_search', 'SPLIT_LENGTH'))
split_overlap = int(config.get('semantic_search', 'SPLIT_OVERLAP'))
split_respect_sentence_boundary = bool(int(config.get('semantic_search',
                                        'RESPECT_SENTENCE_BOUNDARY')))
remove_punc = bool(int(config.get('semantic_search', 'REMOVE_PUNC')))
embedding_model = config.get('semantic_search', 'RETRIEVER')
embedding_model_format = config.get('semantic_search', 'RETRIEVER_FORMAT')
embedding_layer = int(config.get('semantic_search', 'RETRIEVER_EMB_LAYER'))
embedding_dim = int(config.get('semantic_search', 'EMBEDDING_DIM'))
max_seq_len = int(config.get('semantic_search', 'MAX_SEQ_LENGTH'))
retriever_top_k = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))
reader_model = config.get('semantic_search', 'READER')
reader_top_k = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))
top_k_per_candidate = int(config.get('semantic_search', 'READER_TOP_K_PER_CANDIDATE'))
lexical_split_by = config.get('lexical_search', 'SPLIT_BY')
lexical_split_length = int(config.get('lexical_search', 'SPLIT_LENGTH'))
lexical_split_overlap = int(config.get('lexical_search', 'SPLIT_OVERLAP'))
lexical_remove_punc = bool(int(config.get('lexical_search', 'REMOVE_PUNC')))
lexical_top_k = int(config.get('lexical_search', 'TOP_K'))

def app():

    with st.container():
        st.markdown("<h1 style='text-align: center; \
                      color: black;'> Search</h1>",
                    unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):

        st.write(
            """
            The *Search* app is an interface \
            for doing contextual and keyword searches in \
            policy documents. \
            """)
        st.write("")
        st.write(""" The application allows the user to perform a search \
            based on two options: a lexical search ([TFIDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)) \
            and a semantic search ([bi-encoder](https://www.sbert.net/examples/applications/retrieve_rerank/README.html)). \
            The lexical search only \
            displays paragraphs in the document with exact matching results, \
            while the semantic search shows paragraphs with meaningful connections \
            (e.g., synonyms) based on the search context. Both \
            methods employ a probabilistic retrieval framework in their identification \
            of relevant paragraphs. By default the search is performed using \
            'Semantic Search'; to find 'Exact/Lexical Matches' please tick the \
            checkbox provided, which will bypass semantic search. Furthermore, \
            the application allows the user to search for pre-defined keywords \
            from different thematic buckets present in the sidebar.""")
        st.write("")
        st.write(""" The Exact Matches option gives back the top {} findings, and Semantic
            Search provides the top {} answers.""".format(lexical_top_k, retriever_top_k))
        st.write("")
        st.write("")
        st.markdown("Some runtime metrics tested with cpu: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
        col1, col2, col3 = st.columns([2, 4, 4])
        with col1:
            st.caption("OCR File processing")
            # st.markdown('<div style="text-align: center;">50 sec</div>', unsafe_allow_html=True)
            st.write("50 sec")

        with col2:
            st.caption("Lexical search on 200 paragraphs (~35 pages)")
            # st.markdown('<div style="text-align: center;">12 sec</div>', unsafe_allow_html=True)
            st.write("15 sec")

        with col3:
            st.caption("Semantic search on 200 paragraphs (~35 pages)")
            # st.markdown('<div style="text-align: center;">120 sec</div>', unsafe_allow_html=True)
            st.write("120 sec (including embedding creation)")

    with st.sidebar:
        with open('docStore/sample/keywordexample.json', 'r') as json_file:
            keywordexample = json.load(json_file)

        # genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
        st.caption("Select Keyword Category")
        genre = checkbox_without_preselect(list(keywordexample.keys()))
        if genre:
            keywordList = keywordexample[genre]
        else:
            keywordList = None

        st.markdown("---")

    with st.container():
        type_hinting = "Please enter here your question and we \
                        will look for an answer in the document \
                        OR enter the keyword you are looking \
                        for and we will look for similar \
                        context in the document. \
                        You can also explore predefined sets of keywords from the sidebar. "
        if keywordList is not None:
            # queryList = st.text_input("You selected the {} category we \
            #                            will look for these keywords in document".format(genre),
            #                            value="{}".format(keywordList))
            queryList = st.text_input(type_hinting,
                                      value="{}".format(keywordList))
        else:
            queryList = st.text_input(type_hinting,
                                      placeholder="Enter keyword/query here")

        searchtype = st.checkbox("Show only Exact Matches")
        if st.button("Find them"):

            if queryList == "":
                st.info("🤔 No keyword provided, if you don't have any, \
                         please try the example sets from the sidebar!")
                logging.warning("Terminated as no keyword provided")
            else:
                if 'filepath' in st.session_state:

                    if searchtype:
                        all_documents = runLexicalPreprocessingPipeline(
                            file_name=st.session_state['filename'],
                            file_path=st.session_state['filepath'],
                            split_by=lexical_split_by,
                            split_length=lexical_split_length,
                            split_overlap=lexical_split_overlap,
                            remove_punc=lexical_remove_punc)
                        logging.info("performing lexical search")
                        with st.spinner("Performing Exact matching search \
                                         (Lexical search) for you"):
                            lexical_search(query=queryList,
                                           documents=all_documents['documents'],
                                           top_k=lexical_top_k)
                    else:
                        all_documents = runSemanticPreprocessingPipeline(
                            file_path=st.session_state['filepath'],
                            file_name=st.session_state['filename'],
                            split_by=split_by,
                            split_length=split_length,
                            split_overlap=split_overlap,
                            remove_punc=remove_punc,
                            split_respect_sentence_boundary=split_respect_sentence_boundary)
                        if len(all_documents['documents']) > 100:
                            warning_msg = ": This might take some time, please sit back and relax."
                        else:
                            warning_msg = ""

                        logging.info("starting semantic search")
                        with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)):
                            semantic_keywordsearch(query=queryList,
                                                   documents=all_documents['documents'],
                                                   embedding_model=embedding_model,
                                                   embedding_layer=embedding_layer,
                                                   embedding_model_format=embedding_model_format,
                                                   reader_model=reader_model, reader_top_k=reader_top_k,
                                                   retriever_top_k=retriever_top_k, embedding_dim=embedding_dim,
                                                   max_seq_len=max_seq_len,
                                                   top_k_per_candidate=top_k_per_candidate)

                else:
                    st.info("🤔 No document found, please try to upload it at the sidebar!")
                    logging.warning("Terminated as no document provided")
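Note (added for context, not part of the deleted file): keyword_search.py drives its sidebar keyword buckets from docStore/sample/keywordexample.json, which this diff does not include. Below is a hedged guess at its shape, inferred only from how the code indexes it (category name mapped to a pre-filled keyword string); the categories and keywords are invented for illustration.

# Hedged sketch: structure inferred from keyword_search.py (keys feed the sidebar
# checkboxes, the selected value pre-fills the query box); contents are invented.
import json

sample_json = '''{
  "Climate Change": "greenhouse gas, adaptation, mitigation",
  "Energy": "renewable energy, solar, grid access"
}'''

keywordexample = json.loads(sample_json)   # the app uses json.load on the file instead
genre = list(keywordexample.keys())[0]     # stands in for the sidebar selection
keywordList = keywordexample[genre]        # value passed to st.text_input(..., value=...)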
appStore/multiapp.py
DELETED
@@ -1,70 +0,0 @@
"""Framework for running multiple Streamlit applications as a single app.
"""
import streamlit as st
from PIL import Image
from streamlit_option_menu import option_menu
from utils.uploadAndExample import add_upload

class MultiApp:
    """Framework for combining multiple streamlit applications.
    Usage:
        def foo():
            st.title("Hello Foo")
        def bar():
            st.title("Hello Bar")
        app = MultiApp()
        app.add_app("Foo", "house", foo)
        app.add_app("Bar", "gear", bar)
        app.run()
    It is also possible to keep each application in a separate file.
        import foo
        import bar
        app = MultiApp()
        app.add_app("Foo", "house", foo.app)
        app.add_app("Bar", "gear", bar.app)
        app.run()
    """
    def __init__(self):
        self.apps = []

    def add_app(self, title, icon, func):
        """Adds a new application.
        Parameters
        ----------
        func:
            the python function to render this app.
        icon:
            name of the icon shown next to the title in the sidebar menu.
        title:
            title of the app. Appears in the dropdown in the sidebar.
        """
        self.apps.append({
            "title": title,
            "icon": icon,
            "function": func
        })

    def run(self):

        st.sidebar.write(format_func=lambda app: app['title'])
        image = Image.open('docStore/img/sdsn.png')
        st.sidebar.image(image, width=200)

        with st.sidebar:
            selected = option_menu(None, [page["title"] for page in self.apps],
                                   icons=[page["icon"] for page in self.apps],
                                   menu_icon="cast", default_index=0)
            st.markdown("---")


        for index, item in enumerate(self.apps):
            if item["title"] == selected:
                self.apps[index]["function"]()
                break


        choice = st.sidebar.radio(label='Select the Document',
                                  help='You can upload the document \
                                        or else you can try an example document',
                                  options=('Upload Document', 'Try Example'),
                                  horizontal=True)
        add_upload(choice)
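Note (added for context, not part of the deleted file): add_app takes (title, icon, func), with the icon names passed through to streamlit_option_menu. A hedged sketch of how an entry-point script could have wired the deleted pages together is shown below; the page titles and icon names are illustrative assumptions, only the add_app/run call pattern and the module names come from this diff.

# Hedged usage sketch for MultiApp; titles and bootstrap icon names are illustrative.
from appStore.multiapp import MultiApp
import appStore.info as info
import appStore.keyword_search as keyword_search
import appStore.sdg_analysis as sdg_analysis

app = MultiApp()
app.add_app("About", "house", info.app)
app.add_app("Search", "search", keyword_search.app)
app.add_app("SDG Analysis", "bar-chart", sdg_analysis.app)
app.run()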
appStore/sdg_analysis.py
DELETED
@@ -1,179 +0,0 @@
# set path
import glob, os, sys
sys.path.append('../utils')

# import needed libraries
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from st_aggrid import AgGrid
from st_aggrid.shared import ColumnsAutoSizeMode
from utils.sdg_classifier import sdg_classification
from utils.sdg_classifier import runSDGPreprocessingPipeline, load_sdgClassifier
from utils.keyword_extraction import textrank
import logging
logger = logging.getLogger(__name__)
from utils.checkconfig import getconfig


# Declare all the necessary variables
config = getconfig('paramconfig.cfg')
model_name = config.get('sdg', 'MODEL')
split_by = config.get('sdg', 'SPLIT_BY')
split_length = int(config.get('sdg', 'SPLIT_LENGTH'))
split_overlap = int(config.get('sdg', 'SPLIT_OVERLAP'))
remove_punc = bool(int(config.get('sdg', 'REMOVE_PUNC')))
split_respect_sentence_boundary = bool(int(config.get('sdg', 'RESPECT_SENTENCE_BOUNDARY')))
threshold = float(config.get('sdg', 'THRESHOLD'))
top_n = int(config.get('sdg', 'TOP_KEY'))


def app():

    #### APP INFO #####
    with st.container():
        st.markdown("<h1 style='text-align: center; color: black;'> SDG Classification and Keyphrase Extraction </h1>", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=False):

        st.write(
            """
            The *SDG Analysis* app is an easy-to-use interface built \
            in Streamlit for analyzing policy documents with respect to SDG \
            classification for the paragraphs/texts in the document and \
            extracting the keyphrases per SDG label - developed by GIZ Data \
            and the Sustainable Development Solution Network. \n
            """)
        st.write("""**Document Processing:** The uploaded/selected document is \
            automatically cleaned and split into paragraphs with a maximum \
            length of 120 words using a Haystack preprocessing pipeline. The \
            length of 120 is an empirical value which should reflect the length \
            of a “context” and should limit the paragraph length deviation. \
            However, since we want to respect the sentence boundary, the limit \
            can be breached and hence this limit of 120 is tentative. \n
            """)
        st.write("""**SDG Classification:** The application assigns paragraphs \
            to 16 of the 17 United Nations Sustainable Development Goals (SDGs). \
            SDG 17 “Partnerships for the Goals” is excluded from the analysis due \
            to its broad nature, which could potentially inflate the results. \
            Each paragraph is assigned to one SDG only. Again, the results are \
            displayed in a summary table including the number of the SDG, a \
            relevancy score highlighted through a green color shading, and the \
            respective text of the analyzed paragraph. Additionally, a pie \
            chart with a blue color shading is displayed which illustrates the \
            three most prominent SDGs in the document. The SDG classification \
            uses open-source training [data](https://zenodo.org/record/5550238#.Y25ICHbMJPY) \
            from [OSDG.ai](https://osdg.ai/), which is a global \
            partnership and growing community of researchers and institutions \
            interested in the classification of research according to the \
            Sustainable Development Goals. The summary table only displays \
            paragraphs with a calculated relevancy score above 85%. \n""")

        st.write("""**Keyphrase Extraction:** The application extracts 15 \
            keyphrases from the document for each SDG label and displays the \
            results in a summary table. The keyphrases are extracted using \
            [Textrank](https://github.com/summanlp/textrank), \
            which is an easy-to-use, computationally less expensive \
            model leveraging a combination of TFIDF and graph networks.
            """)
        st.write("")
        st.write("")
        st.markdown("Some runtime metrics tested with cpu: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
        col1, col2, col3, col4 = st.columns([2, 2, 4, 4])
        with col1:
            st.caption("Loading Time Classifier")
            # st.markdown('<div style="text-align: center;">12 sec</div>', unsafe_allow_html=True)
            st.write("12 sec")
        with col2:
            st.caption("OCR File processing")
            # st.markdown('<div style="text-align: center;">50 sec</div>', unsafe_allow_html=True)
            st.write("50 sec")
        with col3:
            st.caption("SDG Classification of 200 paragraphs (~35 pages)")
            # st.markdown('<div style="text-align: center;">120 sec</div>', unsafe_allow_html=True)
            st.write("120 sec")
        with col4:
            st.caption("Keyword extraction for 200 paragraphs (~35 pages)")
            # st.markdown('<div style="text-align: center;">3 sec</div>', unsafe_allow_html=True)
            st.write("3 sec")


    ### Main app code ###
    with st.container():
        if st.button("RUN SDG Analysis"):

            if 'filepath' in st.session_state:
                file_name = st.session_state['filename']
                file_path = st.session_state['filepath']
                classifier = load_sdgClassifier(classifier_name=model_name)
                st.session_state['sdg_classifier'] = classifier
                all_documents = runSDGPreprocessingPipeline(file_name=file_name,
                                                            file_path=file_path, split_by=split_by,
                                                            split_length=split_length,
                                                            split_respect_sentence_boundary=split_respect_sentence_boundary,
                                                            split_overlap=split_overlap, remove_punc=remove_punc)

                if len(all_documents['documents']) > 100:
                    warning_msg = ": This might take some time, please sit back and relax."
                else:
                    warning_msg = ""

                with st.spinner("Running SDG Classification{}".format(warning_msg)):

                    df, x = sdg_classification(haystack_doc=all_documents['documents'],
                                               threshold=threshold)
                    df = df.drop(['Relevancy'], axis=1)
                    sdg_labels = x.SDG.unique()
                    textrank_keyword_list = []
                    for label in sdg_labels:
                        sdgdata = " ".join(df[df.SDG == label].text.to_list())
                        textranklist_ = textrank(textdata=sdgdata, words=top_n)
                        if len(textranklist_) > 0:
                            textrank_keyword_list.append({'SDG': label, 'TextRank Keywords': ",".join(textranklist_)})
                    textrank_keywords_df = pd.DataFrame(textrank_keyword_list)


                    plt.rcParams['font.size'] = 25
                    colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
                    # plot
                    fig, ax = plt.subplots()
                    ax.pie(x['count'], colors=colors, radius=2, center=(4, 4),
                           wedgeprops={"linewidth": 1, "edgecolor": "white"},
                           textprops={'fontsize': 14},
                           frame=False, labels=list(x.SDG_Num),
                           labeldistance=1.2)
                    # fig.savefig('temp.png', bbox_inches='tight', dpi=100)


                    st.markdown("#### Anything related to SDGs? ####")

                    c4, c5, c6 = st.columns([1, 2, 2])

                    with c5:
                        st.pyplot(fig)
                    with c6:
                        labeldf = x['SDG_name'].values.tolist()
                        labeldf = "<br>".join(labeldf)
                        st.markdown(labeldf, unsafe_allow_html=True)
                    st.write("")
                    st.markdown("###### What keywords are present under SDG classified text? ######")

                    AgGrid(textrank_keywords_df, reload_data=False,
                           update_mode="value_changed",
                           columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
                    st.write("")
                    st.markdown("###### Top few SDG Classified paragraph/text results ######")

                    AgGrid(df, reload_data=False, update_mode="value_changed",
                           columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
            else:
                st.info("🤔 No document found, please try to upload it at the sidebar!")
                logging.warning("Terminated as no document provided")
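Note (added for context, not part of the deleted file): the schema that sdg_classification returns is not visible in this diff. Inferring only from how df and x are indexed above, a hedged sketch of the two frames is given below; the column names follow the accesses in the code, while the row values are made up.

# Hedged sketch of the frames sdg_analysis.py expects back from sdg_classification,
# inferred solely from the column accesses above; the rows are illustrative.
import pandas as pd

df = pd.DataFrame({                 # per-paragraph results (before 'Relevancy' is dropped)
    'text': ["Example paragraph on access to clean energy."],
    'SDG': [7],
    'Relevancy': [0.91],
})

x = pd.DataFrame({                  # per-SDG aggregate used for the pie chart and labels
    'SDG': [7],
    'SDG_Num': ['SDG 7'],
    'SDG_name': ['SDG 7 - Affordable and clean energy'],
    'count': [12],
})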