File size: 3,159 Bytes
f3a1940
 
f17e764
e4ce8fe
f3a1940
29fd9ee
f3a1940
1081227
f3a1940
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1081227
29fd9ee
e4ce8fe
f17e764
7529755
e4ce8fe
 
 
 
f17e764
b8028b3
 
f17e764
 
 
 
 
5ca912a
127b334
 
5ca912a
f17e764
 
 
dda6e4a
 
f17e764
 
 
 
 
 
 
 
 
 
 
 
fa25391
 
5ca912a
e0bcbc8
 
f17e764
5ca912a
f17e764
d7f99ce
5ca912a
f17e764
6cad12f
f3a1940
6cad12f
f3a1940
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import logging

import numpy as np
import pandas as pd
import streamlit as st
from scipy.sparse import csr_matrix, lil_matrix

# Earlier version of calc_matches that works on a dense NumPy similarity
# matrix. It is disabled by being wrapped in a bare string literal: the string
# is evaluated and discarded when the module is imported, so none of the code
# inside it runs. Unlike the active implementation, it zeroes the diagonal of
# the similarity matrix (in place) before ranking.
"""
def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
    # matching project2 can be any project
    # indecies (rows) = project1
    # columns = project2
    # -> find matches

    # filter out all row considering the filter
    filtered_df_indecies_list = filtered_df.index
    project_df_indecies_list = project_df.index

    np.fill_diagonal(similarity_matrix, 0)
    match_matrix = similarity_matrix[filtered_df_indecies_list, :][:, project_df_indecies_list]

    best_matches_list = np.argsort(match_matrix, axis=None)

    if len(best_matches_list) < top_x:
        top_x = len(best_matches_list)

    # get row (project1) and column (project2) with highest similarity in filtered df
    top_indices = np.unravel_index(best_matches_list[-top_x:], match_matrix.shape)

    # get the corresponding similarity values
    top_values = match_matrix[top_indices]

    p1_df = filtered_df.iloc[top_indices[0]]
    p1_df["similarity"] = top_values
    p2_df = project_df.iloc[top_indices[1]]
    p2_df["similarity"] = top_values

    return p1_df, p2_df
"""

# multi_project_matching
def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
    """Return the ``top_x`` most similar (project1, project2) pairs.

    Rows of the candidate matrix come from ``filtered_df`` (project1) and
    columns from ``project_df`` (project2). The index labels of both
    DataFrames are used directly as row/column positions into
    ``similarity_matrix``, so they must be valid integer positions in it.

    Only entries that are stored in the sparse representation (i.e. non-zero
    similarities for dense input) are ranked, so fewer than ``top_x`` pairs
    are returned when fewer such entries exist.

    Args:
        filtered_df: DataFrame of the projects to find matches for.
        project_df: DataFrame of all candidate matching projects.
        similarity_matrix: Dense array-like or SciPy sparse matrix of
            pairwise similarities.
        top_x: Maximum number of pairs to return; values <= 0 yield empty
            results.

    Returns:
        A tuple ``(p1_df, p2_df)`` of equally long DataFrames. Row ``i`` of
        ``p1_df`` (copied from ``filtered_df``) is matched with row ``i`` of
        ``p2_df`` (copied from ``project_df``); both carry a ``similarity``
        column. Pairs are ordered by ascending similarity (best match last).
    """
    logger = logging.getLogger(__name__)
    logger.debug(
        "calc_matches: filtered_df=%s project_df=%s similarity_matrix=%s",
        filtered_df.shape,
        project_df.shape,
        getattr(similarity_matrix, "shape", None),
    )

    # Index labels double as positions into the similarity matrix.
    row_labels = filtered_df.index.to_list()
    col_labels = project_df.index.to_list()

    if top_x <= 0 or not row_labels or not col_labels:
        # Nothing to rank; skip sparse indexing with empty index lists.
        rows = np.empty(0, dtype=np.intp)
        cols = np.empty(0, dtype=np.intp)
        top_values = np.empty(0, dtype=float)
    else:
        # CSR supports the row/column fancy indexing used below.
        if not isinstance(similarity_matrix, csr_matrix):
            similarity_matrix = csr_matrix(similarity_matrix)

        # NOTE(review): the diagonal is not zeroed here, so a project that is
        # present in both DataFrames can be returned as its own best match.
        # Confirm whether callers pass a matrix with a cleared diagonal.
        # COO keeps `row`, `col` and `data` aligned entry by entry, so a
        # position found by sorting `data` maps to the correct 2D coordinate.
        # (Positions in `data` are NOT flat indices into the dense matrix.)
        match_matrix = similarity_matrix[row_labels, :][:, col_labels].tocoo()
        logger.debug("calc_matches: match_matrix=%s", match_matrix.shape)

        order = np.argsort(match_matrix.data, kind="stable")
        # Explicit start offset avoids the `[-0:]` "select everything" trap.
        n_top = min(top_x, len(order))
        top = order[len(order) - n_top:]

        rows = match_matrix.row[top]
        cols = match_matrix.col[top]
        top_values = match_matrix.data[top]

    # Submatrix coordinates are positions within the DataFrames, so select
    # positionally; this also stays correct when index labels are duplicated.
    p1_df = filtered_df.iloc[rows].copy()
    p1_df["similarity"] = top_values

    p2_df = project_df.iloc[cols].copy()
    p2_df["similarity"] = top_values

    logger.debug("finished calc matches")
    return p1_df, p2_df