Spaces: GIZ / Running on CPU Upgrade

ppsingh committed verified commit aa17801 · 1 parent: db1932e

Delete appStore

appStore/__init__.py DELETED
@@ -1 +0,0 @@
- # creating appstore package
 
 
appStore/coherence.py DELETED
@@ -1,156 +0,0 @@
- # set path
- import glob, os, sys
- sys.path.append('../utils')
-
- import streamlit as st
- import ast
- import logging
- from utils.ndc_explorer import countrySpecificCCA, countrySpecificCCM
- from utils.checkconfig import getconfig
- from utils.semantic_search import runSemanticPreprocessingPipeline, process_semantic_output
- from utils.semantic_search import semanticSearchPipeline, runSemanticPipeline
- from st_aggrid import AgGrid
- from st_aggrid.shared import ColumnsAutoSizeMode
-
- # Reading data and declaring necessary variables
- with open('docStore/ndcs/countryList.txt') as dfile:
-     countryList = dfile.read()
- countryList = ast.literal_eval(countryList)
- countrynames = list(countryList.keys())
-
- with open('docStore/ndcs/cca.txt', encoding='utf-8', errors='ignore') as dfile:
-     cca_sent = dfile.read()
- cca_sent = ast.literal_eval(cca_sent)
-
- with open('docStore/ndcs/ccm.txt', encoding='utf-8', errors='ignore') as dfile:
-     ccm_sent = dfile.read()
- ccm_sent = ast.literal_eval(ccm_sent)
-
- config = getconfig('paramconfig.cfg')
- split_by = config.get('coherence', 'SPLIT_BY')
- split_length = int(config.get('coherence', 'SPLIT_LENGTH'))
- split_overlap = int(config.get('coherence', 'SPLIT_OVERLAP'))
- split_respect_sentence_boundary = bool(int(config.get('coherence',
-                                     'RESPECT_SENTENCE_BOUNDARY')))
- remove_punc = bool(int(config.get('coherence', 'REMOVE_PUNC')))
- embedding_model = config.get('coherence', 'RETRIEVER')
- embedding_model_format = config.get('coherence', 'RETRIEVER_FORMAT')
- embedding_layer = int(config.get('coherence', 'RETRIEVER_EMB_LAYER'))
- embedding_dim = int(config.get('coherence', 'EMBEDDING_DIM'))
- max_seq_len = int(config.get('coherence', 'MAX_SEQ_LENGTH'))
- retriever_top_k = int(config.get('coherence', 'RETRIEVER_TOP_K'))
-
-
-
- def app():
-
-     #### APP INFO #####
-     with st.container():
-         st.markdown("<h1 style='text-align: center; \
-                     color: black;'> NDC Comparison</h1>",
-                     unsafe_allow_html=True)
-         st.write(' ')
-         st.write(' ')
-         with st.expander("ℹ️ - About this app", expanded=False):
-
-             st.write(
-                 """
-                 The *NDC Comparison* application provides an easy evaluation of \
-                 coherence between a given policy document and a country’s (Intended) \
-                 Nationally Determined Contribution (INDC/NDC) using open-source \
-                 data from the German Institute of Development and Sustainability’s \
-                 (IDOS) [NDC Explorer](https://klimalog.idos-research.de/ndc/#NDCExplorer/worldMap?NewAndUpdatedNDC??income???catIncome). \
-                 """)
-             st.write("")
-             st.write(""" The user can select a country context via the drop-down menu \
-                 on the left-hand side of the application. Subsequently, the user is \
-                 given the opportunity to manually upload another policy document \
-                 from the same national context or to select a pre-loaded example \
-                 document. Thereafter, the user can choose between two categories \
-                 to compare coherence between the documents: climate change adaptation \
-                 and climate change mitigation. Based on the selected information, \
-                 the application identifies relevant paragraphs in the uploaded \
-                 document and assigns them to the respective indicator from the NDC \
-                 Explorer. Currently, the NDC Explorer has 20 indicators under \
-                 climate change mitigation (e.g., fossil fuel production, REDD+) and \
-                 22 indicators under climate change adaptation (e.g., sea level rise, \
-                 investment needs). The assignment of a paragraph to a corresponding \
-                 indicator is based on vector similarities, of which the top 3 results, \
-                 if found, are shown to the user. """)
-             st.write("")
-             st.write("")
-             st.markdown("Some runtime metrics tested with cpu: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
-             col1, col2 = st.columns(2)
-             with col1:
-                 st.caption("OCR file processing")
-                 # st.markdown('<div style="text-align: center;">50 sec</div>', unsafe_allow_html=True)
-                 st.write("50 sec")
-
-             with col2:
-                 st.caption("NDC comparison on 200 paragraphs (~35 pages)")
-                 # st.markdown('<div style="text-align: center;">12 sec</div>', unsafe_allow_html=True)
-                 st.write("140 sec")
-
-     with st.sidebar:
-
-         option = st.selectbox('Select Country', countrynames)
-         countryCode = countryList[option]
-         st.markdown("---")
-
-         genre = st.radio("Select Category", ('Climate Change Adaptation',
-                                              'Climate Change Mitigation'))
-         st.markdown("---")
-
-     with st.container():
-         if st.button("Compare with NDC"):
-             sent_cca = countrySpecificCCA(cca_sent, 1, countryCode)
-             sent_ccm = countrySpecificCCM(ccm_sent, 1, countryCode)
-
-             if 'filepath' in st.session_state:
-                 allDocuments = runSemanticPreprocessingPipeline(
-                     file_path=st.session_state['filepath'],
-                     file_name=st.session_state['filename'],
-                     split_by=split_by,
-                     split_length=split_length,
-                     split_overlap=split_overlap,
-                     remove_punc=remove_punc,
-                     split_respect_sentence_boundary=split_respect_sentence_boundary)
-                 # genre = st.radio("Select Category", ('Climate Change Adaptation', 'Climate Change Mitigation'))
-                 if genre == 'Climate Change Adaptation':
-                     sent_dict = sent_cca
-                 else:
-                     sent_dict = sent_ccm
-                 sent_labels = []
-                 for key, sent in sent_dict.items():
-                     sent_labels.append(sent)
-                 if len(allDocuments['documents']) > 100:
-                     warning_msg = ": This might take some time, please sit back and relax."
-                 else:
-                     warning_msg = ""
-                 logging.info("starting coherence analysis, \
-                              country selected {}".format(option))
-                 with st.spinner("Performing Coherence Analysis for {} \
-                                 under {} category{}".format(option, genre, warning_msg)):
-                     semanticsearch_pipeline, doc_store = semanticSearchPipeline(
-                         documents=allDocuments['documents'],
-                         embedding_model=embedding_model,
-                         embedding_layer=embedding_layer,
-                         embedding_model_format=embedding_model_format,
-                         retriever_top_k=retriever_top_k,
-                         embedding_dim=embedding_dim,
-                         max_seq_len=max_seq_len, useQueryCheck=False)
-                     raw_output = runSemanticPipeline(pipeline=semanticsearch_pipeline,
-                                                      queries=sent_labels)
-                     results_df = process_semantic_output(raw_output)
-                     results_df = results_df.drop(['answer', 'answer_offset',
-                                                   'context_offset', 'context',
-                                                   'reader_score', 'id'],
-                                                  axis=1)
-
-                     for i, key in enumerate(list(sent_dict.keys())):
-                         st.subheader("Relevant paragraphs for topic: {}".format(key))
-                         df = results_df[results_df['query'] == sent_dict[key]].reset_index(drop=True)
-                         for j in range(min(3, len(df))):
-                             st.write('Result {}.'.format(j + 1))
-                             st.write(df.loc[j]['content'] + '\n')
-
-             else:
-                 st.info("🤔 No document found, please try to upload it at the sidebar!")
-                 logging.warning("Terminated as no document provided")
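
The description above assigns uploaded paragraphs to NDC Explorer indicators by vector similarity and surfaces the top 3 hits per indicator. A minimal sketch of that matching step, assuming a generic sentence-transformers bi-encoder (the model name below is illustrative, not the RETRIEVER configured in paramconfig.cfg, and top_matches is a hypothetical helper, not part of the deleted utils):

# Sketch only: rank paragraphs against one indicator query by cosine similarity
# and keep the top 3, mirroring the behaviour described in coherence.py above.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative model choice

def top_matches(indicator_query, paragraphs, k=3):
    query_emb = model.encode([indicator_query])   # shape (1, dim)
    para_emb = model.encode(paragraphs)           # shape (n, dim)
    # cosine similarity between the indicator query and every paragraph
    sims = (para_emb @ query_emb.T).ravel() / (
        np.linalg.norm(para_emb, axis=1) * np.linalg.norm(query_emb) + 1e-12)
    order = np.argsort(-sims)[:min(k, len(paragraphs))]
    return [(paragraphs[i], float(sims[i])) for i in order]

print(top_matches("sea level rise",
                  ["Coastal flooding is projected to increase by 2050.",
                   "The budget allocates funds for road maintenance."]))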
 
appStore/info.py DELETED
@@ -1,72 +0,0 @@
- import streamlit as st
-
- def app():
-
-
-     with open('style.css') as f:
-         st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
-
-     st.markdown("<h2 style='text-align: center; \
-                 color: black;'> Policy Action Tracker</h2>",
-                 unsafe_allow_html=True)
-
-
-     st.markdown("<div style='text-align: center; \
-                 color: grey;'>The Policy Action Tracker is an open-source \
-                 digital tool which aims to assist policy analysts and \
-                 other users in extracting and filtering relevant \
-                 information from policy documents.</div>",
-                 unsafe_allow_html=True)
-     footer = """
-         <div class="footer-custom">
-             Guidance & Feedback - <a href="https://www.linkedin.com/in/maren-bernlöhr-149891222" target="_blank">Maren Bernlöhr</a> |
-             <a href="https://www.linkedin.com/in/manuelkuhm" target="_blank">Manuel Kuhm</a> |
-             Developer - <a href="https://www.linkedin.com/in/erik-lehmann-giz/" target="_blank">Erik Lehmann</a> |
-             <a href="https://www.linkedin.com/in/jonas-nothnagel-bb42b114b/" target="_blank">Jonas Nothnagel</a> |
-             <a href="https://www.linkedin.com/in/prashantpsingh/" target="_blank">Prashant Singh</a> |
-
-         </div>
-     """
-     st.markdown(footer, unsafe_allow_html=True)
-
-     c1, c2, c3 = st.columns([8, 1, 12])
-     with c1:
-         st.image("docStore/img/ndc.png")
-     with c3:
-         st.markdown('<div style="text-align: justify;">The manual extraction \
-             of relevant information from text documents is a \
-             time-consuming task for any policy analyst. As the amount and length of \
-             public policy documents in relation to sustainable development (such as \
-             National Development Plans and Nationally Determined Contributions) \
-             continuously increases, a major challenge for policy action tracking – the \
-             evaluation of stated goals and targets and their actual implementation on \
-             the ground – arises. Luckily, Artificial Intelligence (AI) and Natural \
-             Language Processing (NLP) methods can help in shortening and easing this \
-             task for policy analysts.</div><br>',
-             unsafe_allow_html=True)
-
-     intro = """
-     <div style="text-align: justify;">
-
-     For this purpose, the United Nations Sustainable Development Solutions \
-     Network (SDSN) and the Deutsche Gesellschaft für Internationale \
-     Zusammenarbeit (GIZ) GmbH have collaborated in the development \
-     of this AI-powered open-source web application that helps find and extract \
-     relevant information from public policy documents faster to facilitate \
-     evidence-based decision-making processes in sustainable development and beyond.
-
-     This tool allows policy analysts and other users to rapidly \
-     search for relevant information/paragraphs in the document according to the \
-     user’s interest, classify the document’s content according to the Sustainable \
-     Development Goals (SDGs), and compare climate-related policy documents and NDCs \
-     across countries using open data from the German Institute of Development and \
-     Sustainability’s (IDOS) NDC Explorer.
-     To understand the application's functionalities and learn more about \
-     the project, see the attached concept note. We hope you like our application 😊
-
-
-     </div>
-     <br>
-     """
-     st.markdown(intro, unsafe_allow_html=True)
-     # st.image("docStore/img/paris.png")
 
appStore/keyword_search.py DELETED
@@ -1,176 +0,0 @@
- # set path
- import glob, os, sys
- sys.path.append('../utils')
-
- import streamlit as st
- import json
- import logging
- from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search
- from utils.semantic_search import runSemanticPreprocessingPipeline, semantic_keywordsearch
- from utils.checkconfig import getconfig
- from utils.streamlitcheck import checkbox_without_preselect
-
- # Declare all the necessary variables
- config = getconfig('paramconfig.cfg')
- split_by = config.get('semantic_search', 'SPLIT_BY')
- split_length = int(config.get('semantic_search', 'SPLIT_LENGTH'))
- split_overlap = int(config.get('semantic_search', 'SPLIT_OVERLAP'))
- split_respect_sentence_boundary = bool(int(config.get('semantic_search',
-                                     'RESPECT_SENTENCE_BOUNDARY')))
- remove_punc = bool(int(config.get('semantic_search', 'REMOVE_PUNC')))
- embedding_model = config.get('semantic_search', 'RETRIEVER')
- embedding_model_format = config.get('semantic_search', 'RETRIEVER_FORMAT')
- embedding_layer = int(config.get('semantic_search', 'RETRIEVER_EMB_LAYER'))
- embedding_dim = int(config.get('semantic_search', 'EMBEDDING_DIM'))
- max_seq_len = int(config.get('semantic_search', 'MAX_SEQ_LENGTH'))
- retriever_top_k = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))
- reader_model = config.get('semantic_search', 'READER')
- reader_top_k = int(config.get('semantic_search', 'RETRIEVER_TOP_K'))
- top_k_per_candidate = int(config.get('semantic_search', 'READER_TOP_K_PER_CANDIDATE'))
- lexical_split_by = config.get('lexical_search', 'SPLIT_BY')
- lexical_split_length = int(config.get('lexical_search', 'SPLIT_LENGTH'))
- lexical_split_overlap = int(config.get('lexical_search', 'SPLIT_OVERLAP'))
- lexical_remove_punc = bool(int(config.get('lexical_search', 'REMOVE_PUNC')))
- lexical_top_k = int(config.get('lexical_search', 'TOP_K'))
-
- def app():
-
-     with st.container():
-         st.markdown("<h1 style='text-align: center; \
-                     color: black;'> Search</h1>",
-                     unsafe_allow_html=True)
-         st.write(' ')
-         st.write(' ')
-
-         with st.expander("ℹ️ - About this app", expanded=False):
-
-             st.write(
-                 """
-                 The *Search* app is an interface \
-                 for doing contextual and keyword searches in \
-                 policy documents. \
-                 """)
-             st.write("")
-             st.write(""" The application allows its user to perform a search \
-                 based on two options: a lexical search ([TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)) \
-                 and a semantic search ([bi-encoder](https://www.sbert.net/examples/applications/retrieve_rerank/README.html)). \
-                 The lexical search only \
-                 displays paragraphs in the document with exact matching results, \
-                 while the semantic search shows paragraphs with meaningful connections \
-                 (e.g., synonyms) based on the search context. Both \
-                 methods employ a probabilistic retrieval framework in their identification \
-                 of relevant paragraphs. By default the search is performed using \
-                 'Semantic Search'; to find 'Exact/Lexical Matches' please tick the \
-                 checkbox provided, which will bypass semantic search. Furthermore, \
-                 the application allows the user to search for pre-defined keywords \
-                 from different thematic buckets present in the sidebar.""")
-             st.write("")
-             st.write(""" The exact-match search returns the top {} findings, and the semantic
-                 search provides the top {} answers.""".format(lexical_top_k, retriever_top_k))
-             st.write("")
-             st.write("")
-             st.markdown("Some runtime metrics tested with cpu: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
-             col1, col2, col3 = st.columns([2, 4, 4])
-             with col1:
-                 st.caption("OCR file processing")
-                 # st.markdown('<div style="text-align: center;">50 sec</div>', unsafe_allow_html=True)
-                 st.write("50 sec")
-
-             with col2:
-                 st.caption("Lexical search on 200 paragraphs (~35 pages)")
-                 # st.markdown('<div style="text-align: center;">12 sec</div>', unsafe_allow_html=True)
-                 st.write("15 sec")
-
-             with col3:
-                 st.caption("Semantic search on 200 paragraphs (~35 pages)")
-                 # st.markdown('<div style="text-align: center;">120 sec</div>', unsafe_allow_html=True)
-                 st.write("120 sec (including embedding creation)")
-
-     with st.sidebar:
-         with open('docStore/sample/keywordexample.json', 'r') as json_file:
-             keywordexample = json.load(json_file)
-
-         # genre = st.radio("Select Keyword Category", list(keywordexample.keys()))
-         st.caption("Select Keyword Category")
-         genre = checkbox_without_preselect(list(keywordexample.keys()))
-         if genre:
-             keywordList = keywordexample[genre]
-         else:
-             keywordList = None
-
-         st.markdown("---")
-
-     with st.container():
-         type_hinting = "Please enter here your question and we \
-                         will look for an answer in the document \
-                         OR enter the keyword you are looking \
-                         for and we will look for similar \
-                         context in the document. \
-                         You can also explore predefined sets of keywords from the sidebar. "
-         if keywordList is not None:
-             # queryList = st.text_input("You selected the {} category we \
-             #             will look for these keywords in document".format(genre),
-             #             value="{}".format(keywordList))
-             queryList = st.text_input(type_hinting,
-                                       value="{}".format(keywordList))
-         else:
-             queryList = st.text_input(type_hinting,
-                                       placeholder="Enter keyword/query here")
-
-         searchtype = st.checkbox("Show only Exact Matches")
-         if st.button("Find them"):
-
-             if queryList == "":
-                 st.info("🤔 No keyword provided, if you don't have any, \
-                         please try example sets from the sidebar!")
-                 logging.warning("Terminated as no keyword provided")
-             else:
-                 if 'filepath' in st.session_state:
-
-                     if searchtype:
-                         all_documents = runLexicalPreprocessingPipeline(
-                             file_name=st.session_state['filename'],
-                             file_path=st.session_state['filepath'],
-                             split_by=lexical_split_by,
-                             split_length=lexical_split_length,
-                             split_overlap=lexical_split_overlap,
-                             remove_punc=lexical_remove_punc)
-                         logging.info("performing lexical search")
-                         with st.spinner("Performing Exact matching search \
-                                         (Lexical search) for you"):
-                             lexical_search(query=queryList,
-                                            documents=all_documents['documents'],
-                                            top_k=lexical_top_k)
-                     else:
-                         all_documents = runSemanticPreprocessingPipeline(
-                             file_path=st.session_state['filepath'],
-                             file_name=st.session_state['filename'],
-                             split_by=split_by,
-                             split_length=split_length,
-                             split_overlap=split_overlap,
-                             remove_punc=remove_punc,
-                             split_respect_sentence_boundary=split_respect_sentence_boundary)
-                         if len(all_documents['documents']) > 100:
-                             warning_msg = ": This might take some time, please sit back and relax."
-                         else:
-                             warning_msg = ""
-
-                         logging.info("starting semantic search")
-                         with st.spinner("Performing Similar/Contextual search{}".format(warning_msg)):
-                             semantic_keywordsearch(query=queryList,
-                                                    documents=all_documents['documents'],
-                                                    embedding_model=embedding_model,
-                                                    embedding_layer=embedding_layer,
-                                                    embedding_model_format=embedding_model_format,
-                                                    reader_model=reader_model, reader_top_k=reader_top_k,
-                                                    retriever_top_k=retriever_top_k, embedding_dim=embedding_dim,
-                                                    max_seq_len=max_seq_len,
-                                                    top_k_per_candidate=top_k_per_candidate)
-
-                 else:
-                     st.info("🤔 No document found, please try to upload it at the sidebar!")
-                     logging.warning("Terminated as no document provided")
-
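
The app description above contrasts a lexical (TF-IDF) search, which only returns exact term matches, with a semantic bi-encoder search that also catches synonyms. A rough sketch of the lexical side using scikit-learn (the real utils.lexical_search helper may work differently; lexical_rank is a hypothetical name):

# Sketch only: TF-IDF ranking of paragraphs for a keyword query, in the spirit
# of the "Show only Exact Matches" option described in keyword_search.py above.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def lexical_rank(query, paragraphs, top_k=10):
    vectorizer = TfidfVectorizer(stop_words="english")
    doc_matrix = vectorizer.fit_transform(paragraphs)   # (n_paragraphs, n_terms)
    query_vec = vectorizer.transform([query])           # (1, n_terms)
    scores = cosine_similarity(query_vec, doc_matrix).ravel()
    ranked = sorted(zip(paragraphs, scores), key=lambda pair: pair[1], reverse=True)
    # only paragraphs sharing exact terms with the query score above zero,
    # which is what distinguishes this mode from the semantic search
    return [(p, s) for p, s in ranked[:top_k] if s > 0]

print(lexical_rank("renewable energy",
                   ["Investment in renewable energy will double by 2030.",
                    "Solar and wind capacity targets are also set."]))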
 
appStore/multiapp.py DELETED
@@ -1,70 +0,0 @@
- """Framework for running multiple Streamlit applications as a single app.
- """
- import streamlit as st
- from PIL import Image
- from streamlit_option_menu import option_menu
- from utils.uploadAndExample import add_upload
-
- class MultiApp:
-     """Framework for combining multiple streamlit applications.
-     Usage:
-         def foo():
-             st.title("Hello Foo")
-         def bar():
-             st.title("Hello Bar")
-         app = MultiApp()
-         app.add_app("Foo", "house", foo)
-         app.add_app("Bar", "book", bar)
-         app.run()
-     It is also possible to keep each application in a separate file.
-         import foo
-         import bar
-         app = MultiApp()
-         app.add_app("Foo", "house", foo.app)
-         app.add_app("Bar", "book", bar.app)
-         app.run()
-     """
-     def __init__(self):
-         self.apps = []
-
-     def add_app(self, title, icon, func):
-         """Adds a new application.
-         Parameters
-         ----------
-         func:
-             the python function to render this app.
-         title:
-             title of the app. Appears in the menu in the sidebar.
-         icon:
-             icon shown next to the title in the sidebar menu.
-         """
-         self.apps.append({
-             "title": title,
-             "icon": icon,
-             "function": func
-         })
-
-     def run(self):
-
-         image = Image.open('docStore/img/sdsn.png')
-         st.sidebar.image(image, width=200)
-
-         with st.sidebar:
-             selected = option_menu(None, [page["title"] for page in self.apps],
-                                    icons=[page["icon"] for page in self.apps],
-                                    menu_icon="cast", default_index=0)
-             st.markdown("---")
-
-
-         for index, item in enumerate(self.apps):
-             if item["title"] == selected:
-                 self.apps[index]["function"]()
-                 break
-
-
-         choice = st.sidebar.radio(label='Select the Document',
-                                   help='You can upload the document \
-                                         or else you can try an example document',
-                                   options=('Upload Document', 'Try Example'),
-                                   horizontal=True)
-         add_upload(choice)
-
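
The MultiApp docstring above shows how pages are registered; since add_app takes a title, an icon, and a render function, the wiring for the deleted pages would have looked roughly like the sketch below (page titles and bootstrap icon names are illustrative guesses, not taken from the repository's app.py):

# Sketch only: registering the deleted appStore pages with MultiApp.
from appStore.multiapp import MultiApp
from appStore import info, keyword_search, sdg_analysis, coherence

app = MultiApp()
app.add_app("Home", "house", info.app)
app.add_app("Search", "search", keyword_search.app)
app.add_app("SDG Analysis", "gear", sdg_analysis.app)
app.add_app("NDC Comparison", "globe", coherence.app)
app.run()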
 
appStore/sdg_analysis.py DELETED
@@ -1,179 +0,0 @@
- # set path
- import glob, os, sys
- sys.path.append('../utils')
-
- # import needed libraries
- import seaborn as sns
- import matplotlib.pyplot as plt
- import numpy as np
- import pandas as pd
- import streamlit as st
- from st_aggrid import AgGrid
- from st_aggrid.shared import ColumnsAutoSizeMode
- from utils.sdg_classifier import sdg_classification
- from utils.sdg_classifier import runSDGPreprocessingPipeline, load_sdgClassifier
- from utils.keyword_extraction import textrank
- import logging
- logger = logging.getLogger(__name__)
- from utils.checkconfig import getconfig
-
-
- # Declare all the necessary variables
- config = getconfig('paramconfig.cfg')
- model_name = config.get('sdg', 'MODEL')
- split_by = config.get('sdg', 'SPLIT_BY')
- split_length = int(config.get('sdg', 'SPLIT_LENGTH'))
- split_overlap = int(config.get('sdg', 'SPLIT_OVERLAP'))
- remove_punc = bool(int(config.get('sdg', 'REMOVE_PUNC')))
- split_respect_sentence_boundary = bool(int(config.get('sdg', 'RESPECT_SENTENCE_BOUNDARY')))
- threshold = float(config.get('sdg', 'THRESHOLD'))
- top_n = int(config.get('sdg', 'TOP_KEY'))
-
-
- def app():
-
-     #### APP INFO #####
-     with st.container():
-         st.markdown("<h1 style='text-align: center; color: black;'> SDG Classification and Keyphrase Extraction </h1>", unsafe_allow_html=True)
-         st.write(' ')
-         st.write(' ')
-
-         with st.expander("ℹ️ - About this app", expanded=False):
-
-             st.write(
-                 """
-                 The *SDG Analysis* app is an easy-to-use interface built \
-                 in Streamlit for analyzing policy documents with respect to SDG \
-                 classification for the paragraphs/texts in the document and \
-                 extracting the keyphrases per SDG label - developed by GIZ Data \
-                 and the Sustainable Development Solutions Network. \n
-                 """)
-             st.write("""**Document Processing:** The uploaded/selected document is \
-                 automatically cleaned and split into paragraphs with a maximum \
-                 length of 120 words using a Haystack preprocessing pipeline. The \
-                 length of 120 is an empirical value which should reflect the length \
-                 of a “context” and should limit the paragraph length deviation. \
-                 However, since we want to respect the sentence boundary, this limit \
-                 can be breached and hence the limit of 120 is tentative. \n
-                 """)
-             st.write("""**SDG Classification:** The application assigns paragraphs \
-                 to 16 of the 17 United Nations Sustainable Development Goals (SDGs). \
-                 SDG 17 “Partnerships for the Goals” is excluded from the analysis due \
-                 to its broad nature, which could potentially inflate the results. \
-                 Each paragraph is assigned to one SDG only. Again, the results are \
-                 displayed in a summary table including the number of the SDG, a \
-                 relevancy score highlighted through a green color shading, and the \
-                 respective text of the analyzed paragraph. Additionally, a pie \
-                 chart with a blue color shading is displayed which illustrates the \
-                 three most prominent SDGs in the document. The SDG classification \
-                 uses open-source training [data](https://zenodo.org/record/5550238#.Y25ICHbMJPY) \
-                 from [OSDG.ai](https://osdg.ai/), which is a global \
-                 partnership and growing community of researchers and institutions \
-                 interested in the classification of research according to the \
-                 Sustainable Development Goals. The summary table only displays \
-                 paragraphs with a calculated relevancy score above 85%. \n""")
-
-             st.write("""**Keyphrase Extraction:** The application extracts 15 \
-                 keyphrases from the document for each SDG label and displays the \
-                 results in a summary table. The keyphrases are extracted \
-                 using [Textrank](https://github.com/summanlp/textrank), \
-                 which is an easy-to-use, computationally less expensive \
-                 model leveraging a combination of TF-IDF and graph networks.
-                 """)
-             st.write("")
-             st.write("")
-             st.markdown("Some runtime metrics tested with cpu: Intel(R) Xeon(R) CPU @ 2.20GHz, memory: 13GB")
-             col1, col2, col3, col4 = st.columns([2, 2, 4, 4])
-             with col1:
-                 st.caption("Loading Time Classifier")
-                 # st.markdown('<div style="text-align: center;">12 sec</div>', unsafe_allow_html=True)
-                 st.write("12 sec")
-             with col2:
-                 st.caption("OCR file processing")
-                 # st.markdown('<div style="text-align: center;">50 sec</div>', unsafe_allow_html=True)
-                 st.write("50 sec")
-             with col3:
-                 st.caption("SDG Classification of 200 paragraphs (~35 pages)")
-                 # st.markdown('<div style="text-align: center;">120 sec</div>', unsafe_allow_html=True)
-                 st.write("120 sec")
-             with col4:
-                 st.caption("Keyword extraction for 200 paragraphs (~35 pages)")
-                 # st.markdown('<div style="text-align: center;">3 sec</div>', unsafe_allow_html=True)
-                 st.write("3 sec")
-
-
-
-     ### Main app code ###
-     with st.container():
-         if st.button("RUN SDG Analysis"):
-
-             if 'filepath' in st.session_state:
-                 file_name = st.session_state['filename']
-                 file_path = st.session_state['filepath']
-                 classifier = load_sdgClassifier(classifier_name=model_name)
-                 st.session_state['sdg_classifier'] = classifier
-                 all_documents = runSDGPreprocessingPipeline(file_name=file_name,
-                                     file_path=file_path, split_by=split_by,
-                                     split_length=split_length,
-                                     split_respect_sentence_boundary=split_respect_sentence_boundary,
-                                     split_overlap=split_overlap, remove_punc=remove_punc)
-
-                 if len(all_documents['documents']) > 100:
-                     warning_msg = ": This might take some time, please sit back and relax."
-                 else:
-                     warning_msg = ""
-
-                 with st.spinner("Running SDG Classification{}".format(warning_msg)):
-
-                     df, x = sdg_classification(haystack_doc=all_documents['documents'],
-                                                threshold=threshold)
-                     df = df.drop(['Relevancy'], axis=1)
-                     sdg_labels = x.SDG.unique()
-                     textrank_keyword_list = []
-                     for label in sdg_labels:
-                         sdgdata = " ".join(df[df.SDG == label].text.to_list())
-                         textranklist_ = textrank(textdata=sdgdata, words=top_n)
-                         if len(textranklist_) > 0:
-                             textrank_keyword_list.append({'SDG': label, 'TextRank Keywords': ",".join(textranklist_)})
-                     textrank_keywords_df = pd.DataFrame(textrank_keyword_list)
-
-
-                     plt.rcParams['font.size'] = 25
-                     colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
-                     # plot
-                     fig, ax = plt.subplots()
-                     ax.pie(x['count'], colors=colors, radius=2, center=(4, 4),
-                            wedgeprops={"linewidth": 1, "edgecolor": "white"},
-                            textprops={'fontsize': 14},
-                            frame=False, labels=list(x.SDG_Num),
-                            labeldistance=1.2)
-                     # fig.savefig('temp.png', bbox_inches='tight', dpi=100)
-
-
-                     st.markdown("#### Anything related to SDGs? ####")
-
-                     c4, c5, c6 = st.columns([1, 2, 2])
-
-                     with c5:
-                         st.pyplot(fig)
-                     with c6:
-                         labeldf = x['SDG_name'].values.tolist()
-                         labeldf = "<br>".join(labeldf)
-                         st.markdown(labeldf, unsafe_allow_html=True)
-                     st.write("")
-                     st.markdown("###### What keywords are present under SDG classified text? ######")
-
-                     AgGrid(textrank_keywords_df, reload_data=False,
-                            update_mode="value_changed",
-                            columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
-                     st.write("")
-                     st.markdown("###### Top few SDG Classified paragraph/text results ######")
-
-                     AgGrid(df, reload_data=False, update_mode="value_changed",
-                            columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)
-             else:
-                 st.info("🤔 No document found, please try to upload it at the sidebar!")
-                 logging.warning("Terminated as no document provided")
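
sdg_analysis.py above keeps only paragraphs whose relevancy score clears the configured threshold and then pools the text per SDG label before keyphrase extraction. A condensed sketch of that grouping step with pandas (the column names follow the dataframe used above; the threshold value and sample rows are illustrative, and the textrank call is reduced to a comment):

# Sketch only: filter classified paragraphs by relevancy and pool text per SDG,
# mirroring the aggregation done before textrank() in sdg_analysis.py above.
import pandas as pd

threshold = 0.85  # illustrative; the real value comes from paramconfig.cfg

df = pd.DataFrame({
    "SDG": [7, 7, 13],
    "text": ["Expand solar capacity.", "Grid modernisation plan.", "Cut emissions 40% by 2030."],
    "Relevancy": [0.91, 0.87, 0.95],
})

kept = df[df["Relevancy"] >= threshold]
pooled = kept.groupby("SDG")["text"].apply(" ".join)
for sdg_label, sdg_text in pooled.items():
    # a keyphrase extractor such as textrank(textdata=sdg_text, words=15) would run here
    print(sdg_label, "->", sdg_text)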