Jan Mühlnikel committed on
Commit
fd7cbe7
·
1 Parent(s): a2d83b8

move all to one app page

Browse files
__pycache__/similarity.cpython-310.pyc CHANGED
Binary files a/__pycache__/similarity.cpython-310.pyc and b/__pycache__/similarity.cpython-310.pyc differ
 
__pycache__/similarity_page.cpython-310.pyc ADDED
Binary file (3.96 kB). View file
 
home.py DELETED
@@ -1,4 +0,0 @@
1
- import streamlit as st
2
-
3
- def show_page():
4
- st.write("home")
 
 
 
 
 
sdg.py DELETED
File without changes
sector.py DELETED
@@ -1,225 +0,0 @@
1
- """
2
- Page to analyse the link between crs codes, countries and organizations
3
- """
4
-
5
- ################
6
- # DEPENDENCIES #
7
- ################
8
- import streamlit as st
9
- import pandas as pd
10
- import utils.crs_table as crs_table
11
- import utils.sdg_table as sdg_table
12
- import utils.filter_modules as filter_modules
13
- """
14
- from importlib.machinery import SourceFileLoader
15
- crs_overlap = SourceFileLoader("crs_overlap", "data/models/crs_overlap.py").load_module()
16
- sdg_overlap = SourceFileLoader("sdg_overlap", "data/models/sdg_overlap.py").load_module()
17
- CONSTANTS = SourceFileLoader("CONSTANTS", "config/CONSTANTS.py").load_module()
18
-
19
- # CACHE DATA
20
- # FETCH NEEDED DATA AND STORE IN CACHE MEMORY TO SAVE LOADING TIME
21
- @st.cache_data
22
- def getCRS3():
23
- # Read in CRS3 CODELISTS
24
- crs3_df = pd.read_csv('app/src/codelists/crs3_codes.csv')
25
- CRS3_CODES = crs3_df['code'].tolist()
26
- CRS3_NAME = crs3_df['name'].tolist()
27
- CRS3_MERGED = {f"{name} - {code}": code for name, code in zip(CRS3_NAME, CRS3_CODES)}
28
-
29
- return CRS3_MERGED
30
-
31
- @st.cache_data
32
- def getCRS5():
33
- # Read in CRS5 CODELISTS
34
- crs5_df = pd.read_csv('app/src/codelists/crs5_codes.csv')
35
- CRS5_CODES = crs5_df['code'].tolist()
36
- CRS5_NAME = crs5_df['name'].tolist()
37
- CRS5_MERGED = {code: [f"{name} - {code}"] for name, code in zip(CRS5_NAME, CRS5_CODES)}
38
-
39
- return CRS5_MERGED
40
-
41
- @st.cache_data
42
- def getSDG():
43
- # Read in SDG CODELISTS
44
- sdg_df = pd.read_csv('app/src/codelists/sdg_goals.csv')
45
- SDG_NAMES = sdg_df['name'].tolist()
46
-
47
- return SDG_NAMES
48
-
49
- @st.cache_data
50
- def getCountry():
51
- # Read in countries from codelist
52
- country_df = pd.read_csv('app/src/codelists/country_codes_ISO3166-1alpha-2.csv')
53
- COUNTRY_CODES = country_df['Alpha-2 code'].tolist()
54
- COUNTRY_NAMES = country_df['Country'].tolist()
55
-
56
- return country_df, COUNTRY_CODES, COUNTRY_NAMES
57
-
58
- CRS3_MERGED = getCRS3()
59
- CRS5_MERGED = getCRS5()
60
- SDG_NAMES = getSDG()
61
- country_df, COUNTRY_CODES, COUNTRY_NAMES = getCountry()
62
-
63
- # SPECIAL SELECTIONS
64
- ## COUNTRY
65
- SPECIAL_COUNTRY_SLECTIONS = ["All"]
66
- SHOW_ALL_COUNTRIES = False # If all countries should be showed in matching
67
-
68
- ## ORGANIZATION
69
- SPECIAL_ORGA_SLECTIONS = ["All"]
70
- SHOW_ALL_ORGAS = False
71
- """
72
- ########
73
- # PAGE #
74
- ########
75
- def show_page():
76
-
77
- """
78
- def show_crs():
79
- # SESSION STATES
80
- st.session_state.crs5_option_disabled = True
81
-
82
- # SELECTION FIELDS
83
- col1, col2 = st.columns([1, 1])
84
- with col1:
85
- #####################
86
- # CRS 3 CODE SELECT #
87
- #####################
88
- crs3_option = st.multiselect(
89
- 'CRS 3',
90
- CRS3_MERGED,
91
- placeholder="Select"
92
- )
93
-
94
- #####################
95
- # CRS 5 CODE SELECT #
96
- #####################
97
- # Only enable crs5 select field when crs3 code is selected
98
- if crs3_option != []:
99
- st.session_state.crs5_option_disabled = False
100
-
101
- # define list of crs5 codes dependent on crs3 codes
102
- crs5_list = [txt[0].replace('"', "") for crs3_item in crs3_option for code, txt in CRS5_MERGED.items() if str(code)[:3] == str(crs3_item)[-3:]]
103
-
104
- # crs5 select field
105
- crs5_option = st.multiselect(
106
- 'CRS 5',
107
- crs5_list,
108
- placeholder="Select",
109
- disabled=st.session_state.crs5_option_disabled
110
- )
111
-
112
- with col2:
113
- # COUNTRY SELECTION
114
- country_option = filter_modules.country_option(SPECIAL_COUNTRY_SLECTIONS, COUNTRY_NAMES)
115
-
116
- # ORGA SELECTION
117
- orga_option = filter_modules.orga_option(SPECIAL_ORGA_SLECTIONS, CONSTANTS.ORGA_SEARCH)
118
-
119
- ################
120
- # SHOW RESULTS #
121
- ################
122
- # Extract Orgas from multiselect
123
- if "All" in orga_option:
124
- SHOW_ALL_ORGAS = True
125
- selected_orgas = []
126
- else:
127
- SHOW_ALL_ORGAS = False
128
- selected_orgas = [str(o).replace(")", "").lower().split("(")[1] for o in orga_option]
129
-
130
- if country_option != []:
131
- # all selection
132
- if "All" in country_option:
133
- SHOW_ALL_COUNTRIES = True
134
- country_option.remove("All")
135
- else:
136
- SHOW_ALL_COUNTRIES = False
137
-
138
- if crs3_option != []:
139
- # CRS 3 codes from option
140
- crs3_list = [i[-3:] for i in crs3_option]
141
-
142
- # get country codes from multiselect
143
- country_names = [str(c) for c in country_option]
144
- country_codes = [
145
- country_df[country_df['Country'] == c]['Alpha-2 code'].values[0].replace('"', "").strip(" ")
146
- for c in country_names
147
- ]
148
-
149
- result_df = crs_overlap.calc_crs3(crs3_list, country_codes, selected_orgas, SHOW_ALL_COUNTRIES, SHOW_ALL_ORGAS)
150
-
151
- if crs5_option != []:
152
- # CRS 5 codes from option
153
- crs5_list = [i[-5:] for i in crs5_option]
154
- result_df = crs_overlap.calc_crs5(crs5_list, country_codes, selected_orgas, SHOW_ALL_COUNTRIES, SHOW_ALL_ORGAS)
155
-
156
- # TABLE FOR CRS OVERLAP
157
- crs_table.show_table(result_df)
158
-
159
- def show_sdg():
160
- # SELECTION
161
- col1, col2 = st.columns([1, 1])
162
- with col1:
163
- # SDG SELECT
164
- sdg_option = st.selectbox(
165
- label = 'SDG',
166
- index = None,
167
- placeholder = "Select SDG",
168
- options = SDG_NAMES,
169
- )
170
-
171
- with col2:
172
- # COUNTRY SELECTION
173
- country_option = filter_modules.country_option(SPECIAL_COUNTRY_SLECTIONS, COUNTRY_NAMES)
174
-
175
- # ORGA SELECTION
176
- orga_option = filter_modules.orga_option(SPECIAL_ORGA_SLECTIONS, CONSTANTS.ORGA_SEARCH)
177
-
178
-
179
- # SHOW RESULTS
180
- if sdg_option != None:
181
- sdg_int = int(sdg_option.split(" ")[0].replace(".", ""))
182
- # Extract Orgas from multiselect
183
- if "All" in orga_option:
184
- SHOW_ALL_ORGAS = True
185
- selected_orgas = []
186
- else:
187
- SHOW_ALL_ORGAS = False
188
- selected_orgas = [str(o).replace(")", "").lower().split("(")[1] for o in orga_option]
189
-
190
- if country_option != []:
191
- # all selection
192
- if "All" in country_option:
193
- SHOW_ALL_COUNTRIES = True
194
- country_option.remove("All")
195
- else:
196
- SHOW_ALL_COUNTRIES = False
197
-
198
- country_names = [str(c) for c in country_option]
199
- country_codes = [
200
- country_df[country_df['Country'] == c]['Alpha-2 code'].values[0].replace('"', "").strip(" ")
201
- for c in country_names
202
- ]
203
-
204
- result_df = sdg_overlap.calc_crs3(sdg_int, country_codes, selected_orgas, SHOW_ALL_COUNTRIES, SHOW_ALL_ORGAS)
205
-
206
- # TABLE FOR SDG OVERLAP
207
- sdg_table.show_table(result_df)
208
-
209
- # SELECT IF CRS or SDG Match
210
- match_option = st.selectbox(
211
- label = 'Matching Method',
212
- index = 0,
213
- placeholder = "Select",
214
- options = ["CRS", "SDG"],
215
- )
216
-
217
- st.write("------------------")
218
-
219
- if match_option == "CRS":
220
- show_crs()
221
- elif match_option == "SDG":
222
- show_sdg()
223
-
224
- """
225
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
similarity.py DELETED
@@ -1,112 +0,0 @@
1
- """
2
- Page for similarities
3
- """
4
-
5
- ################
6
- # DEPENDENCIES #
7
- ################
8
- import streamlit as st
9
- import pandas as pd
10
- from scipy.sparse import load_npz
11
- import pickle
12
- import faiss
13
- from sentence_transformers import SentenceTransformer
14
- import utils.similarity_table as similarity_table
15
- import utils.semantic_search as semantic_search
16
- import psutil
17
- import os
18
-
19
- def get_process_memory():
20
- process = psutil.Process(os.getpid())
21
- return process.memory_info().rss / (1024 * 1024)
22
-
23
- # Cache DATA
24
- # Load Similarity matrix
25
- @st.cache_data
26
- def load_sim_matrix():
27
- loaded_matrix = load_npz("src/similarities.npz")
28
- dense_matrix = loaded_matrix.toarray()
29
-
30
- return dense_matrix
31
-
32
- @st.cache_data
33
- def load_projects():
34
- orgas_df = pd.read_csv("src/projects/project_orgas.csv")
35
- region_df = pd.read_csv("src/projects/project_region.csv")
36
- sector_df = pd.read_csv("src/projects/project_sector.csv")
37
- status_df = pd.read_csv("src/projects/project_status.csv")
38
- texts_df = pd.read_csv("src/projects/project_texts.csv")
39
-
40
- projects_df = pd.merge(orgas_df, region_df, on='iati_id', how='inner')
41
- projects_df = pd.merge(projects_df, sector_df, on='iati_id', how='inner')
42
- projects_df = pd.merge(projects_df, status_df, on='iati_id', how='inner')
43
- projects_df = pd.merge(projects_df, texts_df, on='iati_id', how='inner')
44
-
45
- return projects_df
46
-
47
- @st.cache_resource
48
- def load_model():
49
- model = SentenceTransformer('all-MiniLM-L6-v2')
50
- return model
51
-
52
- # LOAD EMBEDDINGS
53
- @st.cache_data
54
- def load_embeddings_and_index():
55
- # Load embeddings
56
- with open("src/embeddings.pkl", "rb") as fIn:
57
- stored_data = pickle.load(fIn)
58
- sentences = stored_data["sentences"]
59
- embeddings = stored_data["embeddings"]
60
-
61
- # Create FAISS index from the loaded embeddings
62
- dimension = embeddings.shape[1]
63
- faiss_index = faiss.IndexFlatL2(dimension)
64
- faiss_index.add(embeddings)
65
-
66
- return sentences, embeddings, faiss_index
67
-
68
- # LOAD DATA
69
- sim_matrix = load_sim_matrix()
70
- projects_df = load_projects()
71
- model = load_model()
72
- sentences, embeddings, faiss_index = load_embeddings_and_index()
73
-
74
- def show_page():
75
- st.write(f"Current RAM usage of this app: {get_process_memory():.2f} MB")
76
- st.write("Similarities")
77
-
78
- semantic_search.show_search(model, faiss_index, sentences)
79
-
80
- df_subset = projects_df.head(10)
81
- selected_index = st.selectbox('Select an entry', df_subset.index, format_func=lambda x: df_subset.loc[x, 'iati_id'])
82
-
83
- st.write(selected_index)
84
-
85
- # add index and similarity together
86
- indecies = range(0, len(sim_matrix))
87
- similarities = sim_matrix[selected_index]
88
- zipped_sims = list(zip(indecies, similarities))
89
-
90
- # remove all 0 similarities
91
- filtered_sims = [(index, similarity) for index, similarity in zipped_sims if similarity != 0]
92
-
93
- # Select and sort top 20 most similar projects
94
- sorted_sims = sorted(filtered_sims, key=lambda x: x[1], reverse=True)
95
- top_20_sims = sorted_sims[:20]
96
-
97
- # create result data frame
98
- index_list = [tup[0] for tup in top_20_sims]
99
- print(index_list)
100
- result_df = projects_df.iloc[index_list]
101
- print(len(result_df))
102
-
103
- print(len(result_df))
104
- # add other columns to result df
105
-
106
- similarity_list = [tup[1] for tup in top_20_sims]
107
- result_df["similarity"] = similarity_list
108
-
109
- similarity_table.show_table(result_df, similarity_list)
110
-
111
-
112
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/__pycache__/crs_table.cpython-310.pyc DELETED
Binary file (1.21 kB)
 
utils/__pycache__/filter_modules.cpython-310.pyc DELETED
Binary file (997 Bytes)
 
utils/__pycache__/navbar.cpython-310.pyc DELETED
Binary file (1.14 kB)
 
utils/__pycache__/sdg_table.cpython-310.pyc DELETED
Binary file (1.19 kB)
 
utils/__pycache__/semantic_search.cpython-310.pyc DELETED
Binary file (825 Bytes)
 
utils/__pycache__/similarity_table.cpython-310.pyc DELETED
Binary file (1.41 kB)
 
utils/crs_table.py DELETED
@@ -1,49 +0,0 @@
1
- import streamlit as st
2
-
3
- def show_table(data_df):
4
- st.write("------------------")
5
-
6
- st.dataframe(
7
- data_df[["title_main", "orga_abbreviation", "client", "description_main", "country", "crs_3_code", "crs_5_code"]],
8
- use_container_width = True,
9
- height = 35 + 35 * len(data_df),
10
- column_config={
11
- "orga_abbreviation": st.column_config.TextColumn(
12
- "Organization",
13
- help="If description not in English, description in other language provided",
14
- disabled=True
15
- ),
16
- "client": st.column_config.TextColumn(
17
- "Client",
18
- help="Client organization of customer",
19
- disabled=True
20
- ),
21
- "title_main": st.column_config.TextColumn(
22
- "Title",
23
- help="If title not in English, title in other language provided",
24
- disabled=True
25
- ),
26
- "description_main": st.column_config.TextColumn(
27
- "Description",
28
- help="If description not in English, description in other language provided",
29
- disabled=True
30
- ),
31
- "country": st.column_config.TextColumn(
32
- "Country",
33
- help="Country of project",
34
- disabled=True
35
- ),
36
- "crs_3_code": st.column_config.TextColumn(
37
- "CRS 3",
38
- help="CRS 3",
39
- disabled=True
40
- ),
41
- "crs_5_code": st.column_config.TextColumn(
42
- "CRS 5",
43
- help="CRS 5",
44
- disabled=True
45
- ),
46
-
47
- },
48
- hide_index=True,
49
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/filter_modules.py DELETED
@@ -1,21 +0,0 @@
1
- import pandas as pd
2
- import streamlit as st
3
-
4
- def country_option(special_cases, country_names):
5
- country_option = st.multiselect(
6
- 'Country / Countries',
7
- special_cases + country_names,
8
- placeholder="Select"
9
- )
10
-
11
- return country_option
12
-
13
- def orga_option(special_cases, orga_names):
14
- orga_list = special_cases + [f"{v[0]} ({k})" for k, v in orga_names.items()]
15
- orga_option = st.multiselect(
16
- 'Development Bank / Organization',
17
- orga_list,
18
- placeholder="Select"
19
- )
20
-
21
- return orga_option
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/navbar.py DELETED
@@ -1,50 +0,0 @@
1
- import streamlit as st
2
- from streamlit_option_menu import option_menu # https://github.com/victoryhb/streamlit-option-menu
3
-
4
- # giz-dsc colors
5
- # orange: #e5b50d
6
- # green: #48d47b
7
- # blue: #0da2dc
8
- # grey: #dadada
9
-
10
- # giz colors https://www.giz.de/cdc/en/html/59638.html
11
- # red: #c80f0f
12
- # grey: #6f6f6f
13
- # light_grey: #b2b2b2
14
- # light_red: #eba1a3
15
-
16
- def show_navbar():
17
- st.markdown("<h1 style='color: red;'>THIS APP IS WORK IN PROGRESS ...</h1>", unsafe_allow_html=True)
18
-
19
- navbar = option_menu(None, ["Home", "Sector Matches", 'Similarity Matches'],
20
- icons=['house', 'list-task', "list-task", 'list-task'],
21
- menu_icon="cast", default_index=0, orientation="horizontal",
22
- styles={
23
- "container": {
24
- "padding": "0!important",
25
- "background-color": "#F0F0F0"
26
- },
27
- "icon": {
28
- "color": "#c80f0f",
29
- "font-size": "25px"
30
- },
31
- "nav-link": {
32
- "font-size": "25px",
33
- "text-align": "left",
34
- "margin":"0px",
35
- "--hover-color": "#b2b2b2"
36
- },
37
- "nav-link-selected": {
38
- "background-color": "#F0F0F0"
39
- },
40
- "nav-link-text": {
41
- "color": "#333333"
42
- },
43
-
44
- "icon-active": {
45
- "color": "#dadada"
46
- }
47
- }
48
- )
49
-
50
- return navbar
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/sdg_table.py DELETED
@@ -1,43 +0,0 @@
1
- import streamlit as st
2
-
3
- def show_table(data_df):
4
- st.write("------------------")
5
-
6
- st.dataframe(
7
- data_df[["title_main", "orga_abbreviation", "client", "description_main", "country", "sgd_pred_code"]],
8
- use_container_width = True,
9
- height = 35 + 35 * len(data_df),
10
- column_config={
11
- "orga_abbreviation": st.column_config.TextColumn(
12
- "Organization",
13
- help="If description not in English, description in other language provided",
14
- disabled=True
15
- ),
16
- "client": st.column_config.TextColumn(
17
- "Client",
18
- help="Client organization of customer",
19
- disabled=True
20
- ),
21
- "title_main": st.column_config.TextColumn(
22
- "Title",
23
- help="If title not in English, title in other language provided",
24
- disabled=True
25
- ),
26
- "description_main": st.column_config.TextColumn(
27
- "Description",
28
- help="If description not in English, description in other language provided",
29
- disabled=True
30
- ),
31
- "country": st.column_config.TextColumn(
32
- "Country",
33
- help="Country of project",
34
- disabled=True
35
- ),
36
- "sgd_pred_code": st.column_config.TextColumn(
37
- "SDG Prediction",
38
- help="Prediction of SDG's",
39
- disabled=True
40
- ),
41
- },
42
- hide_index=True,
43
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/semantic_search.py DELETED
@@ -1,19 +0,0 @@
1
- import pickle
2
- import faiss
3
- import streamlit as st
4
- from sentence_transformers import SentenceTransformer
5
-
6
- def show_search(model, faiss_index, sentences):
7
- query = st.text_input("Enter your search query:")
8
-
9
- if query:
10
- # Convert query to embedding
11
- query_embedding = model.encode([query])[0].reshape(1, -1)
12
-
13
- # Perform search
14
- D, I = faiss_index.search(query_embedding, k=5) # Search for top 5 similar items
15
-
16
- # Display results
17
- st.write("Top results:")
18
- for i in I[0]:
19
- st.write(sentences[i])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
utils/similarity_table.py DELETED
@@ -1,53 +0,0 @@
1
- import streamlit as st
2
-
3
- def show_table(data_df, similarities:list):
4
- st.write("------------------")
5
-
6
- st.dataframe(
7
- data_df[["title_main", "orga_abbreviation", "client", "description_main", "country", "sgd_pred_code", "crs_3_code", "crs_5_code", "similarity"]],
8
- use_container_width = True,
9
- height = 35 + 35 * len(data_df),
10
- column_config={
11
- "orga_abbreviation": st.column_config.TextColumn(
12
- "Organization",
13
- help="If description not in English, description in other language provided",
14
- disabled=True
15
- ),
16
- "client": st.column_config.TextColumn(
17
- "Client",
18
- help="Client organization of customer",
19
- disabled=True
20
- ),
21
- "title_main": st.column_config.TextColumn(
22
- "Title",
23
- help="If title not in English, title in other language provided",
24
- disabled=True
25
- ),
26
- "description_main": st.column_config.TextColumn(
27
- "Description",
28
- help="If description not in English, description in other language provided",
29
- disabled=True
30
- ),
31
- "country": st.column_config.TextColumn(
32
- "Country",
33
- help="Country of project",
34
- disabled=True
35
- ),
36
- "sgd_pred_code": st.column_config.TextColumn(
37
- "SDG Prediction",
38
- help="Prediction of SDG's",
39
- disabled=True
40
- ),
41
- "crs_3_code": st.column_config.TextColumn(
42
- "CRS 3",
43
- help="CRS 3 code given by organization",
44
- disabled=True
45
- ),
46
- "crs_5_code": st.column_config.TextColumn(
47
- "CRS 5",
48
- help="CRS 5 code given by organization",
49
- disabled=True
50
- ),
51
- },
52
- hide_index=True,
53
- )