Jan Mühlnikel
committed on
Commit
·
7d8805d
1
Parent(s):
09c16ce
enhanced documentation
Browse files
functions/{calc_matches.py → multi_project_matching.py}
RENAMED
@@ -1,24 +1,35 @@
|
|
1 |
-
import pandas as pd
|
2 |
import numpy as np
|
3 |
-
from scipy.sparse import csr_matrix
|
4 |
-
import streamlit as st
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
if not isinstance(similarity_matrix, csr_matrix):
|
10 |
similarity_matrix = csr_matrix(similarity_matrix)
|
11 |
|
|
|
12 |
filtered_indices = filtered_df.index.to_list()
|
13 |
project_indices = project_df.index.to_list()
|
14 |
|
|
|
15 |
match_matrix = similarity_matrix[project_indices, :][:, filtered_indices] # row / column
|
16 |
-
|
17 |
dense_match_matrix = match_matrix.toarray()
|
18 |
-
|
19 |
flat_matrix = dense_match_matrix.flatten()
|
20 |
|
21 |
-
#
|
22 |
top_15_indices = np.argsort(flat_matrix)[-top_x:]
|
23 |
|
24 |
# Convert flat indices back to 2D indices
|
@@ -28,7 +39,6 @@ def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
|
|
28 |
top_15_values = flat_matrix[top_15_indices]
|
29 |
|
30 |
# Prepare the result with row and column indices from original dataframes
|
31 |
-
top_15_matches = []
|
32 |
org_rows = []
|
33 |
org_cols = []
|
34 |
for value, row, col in zip(top_15_values, top_15_2d_indices[0], top_15_2d_indices[1]):
|
@@ -36,14 +46,24 @@ def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
|
|
36 |
original_col_index = filtered_indices[col]
|
37 |
org_rows.append(original_row_index)
|
38 |
org_cols.append(original_col_index)
|
39 |
-
top_15_matches.append((value, original_row_index, original_col_index))
|
40 |
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
p1_df = filtered_df.loc[org_cols].copy()
|
43 |
p1_df['similarity'] = top_15_values
|
44 |
|
45 |
p2_df = project_df.loc[org_rows].copy()
|
46 |
p2_df['similarity'] = top_15_values
|
47 |
-
print("finished calc matches")
|
48 |
|
|
|
49 |
return p1_df, p2_df
|
|
|
|
|
1 |
import numpy as np
|
2 |
+
from scipy.sparse import csr_matrix
|
|
|
3 |
|
4 |
+
"""
|
5 |
+
Function to calculate the multi project matching results
|
6 |
+
|
7 |
+
The Multi-Project Matching Feature uncovers synergy opportunities among various development banks and organizations by facilitating the search for similar projects
|
8 |
+
within a selected filter setting (filtered_df) and all projects (project_df).
|
9 |
+
"""
|
10 |
+
|
11 |
+
def calc_multi_matches(filtered_df, project_df, similarity_matrix, top_x):
|
12 |
+
"""
|
13 |
+
filtered_df: df with applied filters
|
14 |
+
project_df: df with all projects
|
15 |
+
similarity_matrix: SciPy sparse matrix with all similarities between projects
|
16 |
+
top_x: number of top-matching projects which should be displayed
|
17 |
+
"""
|
18 |
+
|
19 |
+
# convert npz sparse matrix into csr matrix
|
20 |
if not isinstance(similarity_matrix, csr_matrix):
|
21 |
similarity_matrix = csr_matrix(similarity_matrix)
|
22 |
|
23 |
+
# extract indices of the projects
|
24 |
filtered_indices = filtered_df.index.to_list()
|
25 |
project_indices = project_df.index.to_list()
|
26 |
|
27 |
+
# size down the matrix to only projects within the filter and convert to dense matrix and flatten it
|
28 |
match_matrix = similarity_matrix[project_indices, :][:, filtered_indices] # row / column
|
|
|
29 |
dense_match_matrix = match_matrix.toarray()
|
|
|
30 |
flat_matrix = dense_match_matrix.flatten()
|
31 |
|
32 |
+
# get the indices of the top_x highest values in the flattened matrix
|
33 |
top_15_indices = np.argsort(flat_matrix)[-top_x:]
|
34 |
|
35 |
# Convert flat indices back to 2D indices
|
|
|
39 |
top_15_values = flat_matrix[top_15_indices]
|
40 |
|
41 |
# Prepare the result with row and column indices from original dataframes
|
|
|
42 |
org_rows = []
|
43 |
org_cols = []
|
44 |
for value, row, col in zip(top_15_values, top_15_2d_indices[0], top_15_2d_indices[1]):
|
|
|
46 |
original_col_index = filtered_indices[col]
|
47 |
org_rows.append(original_row_index)
|
48 |
org_cols.append(original_col_index)
|
|
|
49 |
|
50 |
|
51 |
+
# create two result dataframes
|
52 |
+
|
53 |
+
"""
|
54 |
+
p1_df: first results of match
|
55 |
+
p2_df: matching result
|
56 |
+
|
57 |
+
matches are paired through the positional indices of the p1 and p2 dfs
|
58 |
+
|
59 |
+
match1 p1_df.iloc[0] & p2_df.iloc[0]
|
60 |
+
match2 p1_df.iloc[1] & p2_df.iloc[1]
|
61 |
+
"""
|
62 |
p1_df = filtered_df.loc[org_cols].copy()
|
63 |
p1_df['similarity'] = top_15_values
|
64 |
|
65 |
p2_df = project_df.loc[org_rows].copy()
|
66 |
p2_df['similarity'] = top_15_values
|
|
|
67 |
|
68 |
+
# return both result dfs with matching projects
|
69 |
return p1_df, p2_df
|
similarity_page.py
CHANGED
@@ -14,7 +14,7 @@ from modules.multimatch_result_table import show_multi_table
|
|
14 |
from modules.singlematch_result_table import show_single_table
|
15 |
from functions.filter_projects import filter_projects
|
16 |
from functions.filter_single import filter_single
|
17 |
-
from functions.
|
18 |
from functions.same_country_filter import same_country_filter
|
19 |
from functions.single_similar import find_similar
|
20 |
#import psutil
|
@@ -30,29 +30,14 @@ def get_process_memory():
|
|
30 |
# Catch DATA
|
31 |
|
32 |
# Load Similarity matrix
|
33 |
-
"""
|
34 |
-
@st.cache_data
|
35 |
-
def load_sim_matrix():
|
36 |
-
loaded_matrix = load_npz("src/extended_similarities.npz")
|
37 |
-
dense_matrix = loaded_matrix.toarray().astype('float16')
|
38 |
-
|
39 |
-
return dense_matrix
|
40 |
-
"""
|
41 |
@st.cache_data
|
42 |
def load_sim_matrix():
|
43 |
loaded_matrix = load_npz("src/extended_similarities.npz")
|
44 |
#dense_matrix = loaded_matrix.toarray().astype('float16')
|
45 |
|
46 |
return loaded_matrix
|
47 |
-
# Load Non Similar Orga Matrix
|
48 |
-
"""
|
49 |
-
@st.cache_data
|
50 |
-
def load_nonsameorga_sim_matrix():
|
51 |
-
loaded_matrix = load_npz("src/extended_similarities_nonsimorga.npz")
|
52 |
-
dense_matrix = loaded_matrix.toarray().astype('float16')
|
53 |
|
54 |
-
|
55 |
-
"""
|
56 |
def load_nonsameorga_sim_matrix():
|
57 |
loaded_matrix = load_npz("src/extended_similarities_nonsimorga.npz")
|
58 |
#dense_matrix = loaded_matrix.toarray().astype('float16')
|
@@ -272,10 +257,10 @@ def show_multi_matching_page():
|
|
272 |
## if show only different orgas checkbox is activated
|
273 |
if different_orga_checkbox:
|
274 |
with st.spinner('Please wait...'):
|
275 |
-
p1_df, p2_df =
|
276 |
else:
|
277 |
with st.spinner('Please wait...'):
|
278 |
-
p1_df, p2_df =
|
279 |
|
280 |
# SHOW THE RESULT
|
281 |
show_multi_table(p1_df, p2_df)
|
|
|
14 |
from modules.singlematch_result_table import show_single_table
|
15 |
from functions.filter_projects import filter_projects
|
16 |
from functions.filter_single import filter_single
|
17 |
+
from functions.multi_project_matching import calc_multi_matches
|
18 |
from functions.same_country_filter import same_country_filter
|
19 |
from functions.single_similar import find_similar
|
20 |
#import psutil
|
|
|
30 |
# Catch DATA
|
31 |
|
32 |
# Load Similarity matrix
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
@st.cache_data
|
34 |
def load_sim_matrix():
|
35 |
loaded_matrix = load_npz("src/extended_similarities.npz")
|
36 |
#dense_matrix = loaded_matrix.toarray().astype('float16')
|
37 |
|
38 |
return loaded_matrix
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
+
# Load Non Similar Orga Matrix
|
|
|
41 |
def load_nonsameorga_sim_matrix():
|
42 |
loaded_matrix = load_npz("src/extended_similarities_nonsimorga.npz")
|
43 |
#dense_matrix = loaded_matrix.toarray().astype('float16')
|
|
|
257 |
## if show only different orgas checkbox is activated
|
258 |
if different_orga_checkbox:
|
259 |
with st.spinner('Please wait...'):
|
260 |
+
p1_df, p2_df = calc_multi_matches(filtered_df, compare_df, nonsameorgas_sim_matrix, TOP_X_PROJECTS)
|
261 |
else:
|
262 |
with st.spinner('Please wait...'):
|
263 |
+
p1_df, p2_df = calc_multi_matches(filtered_df, compare_df, sim_matrix, TOP_X_PROJECTS)
|
264 |
|
265 |
# SHOW THE RESULT
|
266 |
show_multi_table(p1_df, p2_df)
|