File size: 2,774 Bytes
f3a1940
 
f17e764
e4ce8fe
f3a1940
29fd9ee
f3a1940
1081227
f3a1940
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1081227
29fd9ee
e4ce8fe
f17e764
e4ce8fe
 
 
 
f17e764
b8028b3
 
f17e764
 
 
 
 
 
 
 
dda6e4a
 
f17e764
 
 
 
 
 
 
 
 
 
 
 
 
d7f99ce
c8d1d9b
f17e764
d7f99ce
 
c8d1d9b
f17e764
d7f99ce
6cad12f
f3a1940
6cad12f
f3a1940
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, lil_matrix
import streamlit as st

"""
def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
    # the matching project2 can be any project
    # indices (rows) = project1
    # columns = project2
    # -> find matches

    # keep only the rows that pass the filter
    filtered_df_indecies_list = filtered_df.index
    project_df_indecies_list = project_df.index

    np.fill_diagonal(similarity_matrix, 0)
    match_matrix = similarity_matrix[filtered_df_indecies_list, :][:, project_df_indecies_list]

    best_matches_list = np.argsort(match_matrix, axis=None)

    if len(best_matches_list) < top_x:
        top_x = len(best_matches_list)

    # get row (project1) and column (project2) with highest similarity in filtered df
    top_indices = np.unravel_index(best_matches_list[-top_x:], match_matrix.shape)

    # get the corresponding similarity values
    top_values = match_matrix[top_indices]

    p1_df = filtered_df.iloc[top_indices[0]]
    p1_df["similarity"] = top_values
    p2_df = project_df.iloc[top_indices[1]]
    p2_df["similarity"] = top_values

    return p1_df, p2_df
"""

# multi_project_matching
def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
    """Return the ``top_x`` most similar (filtered project, project) pairs.

    The index labels of both dataframes are used directly as row/column
    positions into ``similarity_matrix``, so they must be integers that are
    valid positions in that matrix.

    Only entries explicitly stored in the sparse sub-matrix are candidates;
    if fewer than ``top_x`` are stored, fewer pairs are returned.

    Args:
        filtered_df: Candidate "project 1" rows (rows of the sub-matrix).
        project_df: Candidate "project 2" rows (columns of the sub-matrix).
        similarity_matrix: Square-indexable similarity matrix, dense or
            sparse; converted to CSR when it is not already a ``csr_matrix``.
        top_x: Maximum number of pairs to return; values <= 0 yield no pairs.

    Returns:
        ``(p1_df, p2_df)``: copies of the matching rows of ``filtered_df`` and
        ``project_df``. Row ``i`` of both frames describes the same pair and
        carries its score in a ``similarity`` column. Pairs are ordered by
        ascending similarity (the best match is last).
    """
    # Debug output rendered into the Streamlit page.
    st.write(filtered_df.shape)
    st.write(project_df.shape)
    st.write(similarity_matrix.shape)

    # Ensure the matrix is in a suitable format for manipulation
    if not isinstance(similarity_matrix, csr_matrix):
        similarity_matrix = csr_matrix(similarity_matrix)

    # Index labels double as matrix positions. Explicit integer arrays keep
    # the fancy indexing well-defined even when a dataframe is empty.
    filtered_df_indices = np.asarray(filtered_df.index, dtype=np.intp)
    project_df_indices = np.asarray(project_df.index, dtype=np.intp)

    # Sub-matrix: row i <-> filtered_df.iloc[i], column j <-> project_df.iloc[j]
    match_matrix = similarity_matrix[filtered_df_indices, :][:, project_df_indices]

    st.write(match_matrix.shape)

    # COO exposes aligned (row, col, data) triples for the stored entries.
    # Sorting ``data`` therefore yields positions into those triples, which
    # is the only valid way to recover coordinates (positions in ``.data``
    # are NOT flat indices into the dense shape).
    pairs = match_matrix.tocoo()
    n_top = max(0, min(int(top_x), pairs.data.size))
    order = np.argsort(pairs.data, kind="stable")
    # Slice from an explicit start: ``order[-0:]`` would return everything.
    best = order[order.size - n_top:]

    top_indices = (pairs.row[best], pairs.col[best])

    # Get the corresponding similarity values
    top_values = pairs.data[best]

    # NOTE(review): self-matches (same project on both sides) are not
    # excluded here; confirm the matrix diagonal is already zeroed upstream.

    # Create resulting dataframes with top matches and their similarity
    # scores. ``top_indices`` are positions within the sub-matrix, hence
    # ``iloc`` rather than label-based ``loc``.
    st.write(top_indices)
    p1_df = filtered_df.iloc[top_indices[0]].copy()
    p1_df['similarity'] = top_values

    st.dataframe(p1_df)
    p2_df = project_df.iloc[top_indices[1]].copy()
    p2_df['similarity'] = top_values
    st.dataframe(p2_df)
    print("finished calc matches")

    return p1_df, p2_df