File size: 2,367 Bytes
f3a1940
 
f17e764
e4ce8fe
f3a1940
e4ce8fe
f17e764
e4ce8fe
 
 
 
f17e764
eaf56c5
 
f17e764
 
 
 
 
5ca912a
127b334
 
5ca912a
f17e764
a88ee3f
f17e764
dda6e4a
 
f17e764
 
a88ee3f
 
 
f17e764
 
a88ee3f
f17e764
 
a88ee3f
f17e764
a88ee3f
 
 
 
 
 
 
 
 
 
5ca912a
e0bcbc8
 
f17e764
5ca912a
f17e764
d7f99ce
5ca912a
f17e764
6cad12f
f3a1940
6cad12f
f3a1940
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, lil_matrix
import streamlit as st

# multi_project_matching
def calc_matches(filtered_df: pd.DataFrame, project_df: pd.DataFrame, similarity_matrix, top_x: int):
    """Return the ``top_x`` most similar (filtered_df row, project_df row) pairs.

    The index labels of ``filtered_df`` select rows of ``similarity_matrix``
    and the index labels of ``project_df`` select its columns, so both
    indexes must hold valid integer positions into that matrix.

    Args:
        filtered_df: Frame whose index labels pick the rows of the sub-matrix.
        project_df: Frame whose index labels pick the columns of the sub-matrix.
        similarity_matrix: 2-D matrix of pairwise similarity scores, indexable
            with lists of integer positions.
        top_x: Maximum number of pairs to return. Values larger than the number
            of available pairs are clamped; values <= 0 yield empty frames.

    Returns:
        ``(p1_df, p2_df)``: row ``i`` of ``p1_df`` (taken from ``filtered_df``)
        and row ``i`` of ``p2_df`` (taken from ``project_df``) form the i-th
        best pair. Both carry a ``similarity`` column with the pair's score and
        are ordered from highest to lowest similarity.
    """
    st.write(filtered_df.shape)
    st.write(project_df.shape)
    st.write(similarity_matrix.shape)

    # Index labels double as positions into the similarity matrix.
    filtered_df_indices = filtered_df.index.to_list()
    project_df_indices = project_df.index.to_list()

    # Sub-matrix: rows follow filtered_df order, columns follow project_df order.
    match_matrix = similarity_matrix[filtered_df_indices, :][:, project_df_indices]

    st.write(match_matrix.shape)

    # NOTE(review): SciPy sparse matrices have no ``flatten``/``ravel``; densify
    # the (already reduced) sub-matrix so dense and sparse inputs share one path.
    if hasattr(match_matrix, "toarray"):
        match_matrix = match_matrix.toarray()
    match_matrix = np.asarray(match_matrix)

    # Never ask for more pairs than exist (argpartition would raise otherwise).
    k = max(0, min(int(top_x), match_matrix.size))

    if k > 0:
        flat_scores = match_matrix.ravel()
        # argpartition isolates the k largest scores without a full sort ...
        top_flat = np.argpartition(flat_scores, -k)[-k:]
        # ... and only those k candidates are then ranked, best first.
        top_flat = top_flat[np.argsort(flat_scores[top_flat])[::-1]]
        row_indices, col_indices = np.unravel_index(top_flat, match_matrix.shape)
        top_values = flat_scores[top_flat]
    else:
        row_indices = col_indices = np.empty(0, dtype=np.intp)
        top_values = np.empty(0, dtype=float)

    # Rows of the sub-matrix belong to filtered_df, columns to project_df.
    top_filtered_df_indices = filtered_df.index[row_indices].to_list()

    st.write(top_filtered_df_indices)

    # Positional selection mirrors the matrix layout exactly and stays correct
    # even if an index contains duplicate labels.
    p1_df = filtered_df.iloc[row_indices].copy()
    p1_df['similarity'] = top_values

    p2_df = project_df.iloc[col_indices].copy()
    p2_df['similarity'] = top_values
    print("finished calc matches")

    return p1_df, p2_df