File size: 3,509 Bytes
f3a1940
 
5f41368
e4ce8fe
f3a1940
e4ce8fe
f17e764
e4ce8fe
 
 
 
f17e764
3d9250a
 
f17e764
 
 
 
 
5ca912a
127b334
5f41368
127b334
5ca912a
f17e764
a88ee3f
f17e764
5f41368
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dda6e4a
 
f17e764
 
a88ee3f
 
 
f17e764
 
a88ee3f
f17e764
 
a88ee3f
f17e764
5f41368
3d9250a
 
5f41368
3d9250a
 
5f41368
 
 
e54232c
5f41368
 
 
 
e54232c
5f41368
a88ee3f
3137797
 
 
3d9250a
 
5f41368
3d9250a
5ca912a
3137797
a46e9cf
3137797
da4441d
 
3137797
2baee55
e0bcbc8
f17e764
da4441d
f17e764
d7f99ce
da4441d
f17e764
6cad12f
f3a1940
6cad12f
f3a1940
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix
import streamlit as st

# multi_project_matching
def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
    """Return the ``top_x`` most similar (filtered row, project row) pairs.

    The index labels of both dataframes are used as integer positions into
    ``similarity_matrix``: labels of ``filtered_df`` select matrix rows and
    labels of ``project_df`` select matrix columns.  Only entries that are
    explicitly stored in the resulting sparse submatrix are candidates.

    Args:
        filtered_df: DataFrame whose index labels select rows of the matrix.
        project_df: DataFrame whose index labels select columns of the matrix.
        similarity_matrix: Pairwise similarity scores; anything accepted by
            ``scipy.sparse.csr_matrix`` (it is converted when it is not
            already a ``csr_matrix``).
        top_x: Maximum number of pairs to return.  Values <= 0 yield empty
            results; fewer pairs are returned when the submatrix stores
            fewer entries.

    Returns:
        A tuple ``(p1_df, p2_df)`` of equally long DataFrames.  Row ``k`` of
        ``p1_df`` is the ``filtered_df`` row and row ``k`` of ``p2_df`` is the
        ``project_df`` row of the k-th best pair; both carry the pair's score
        in a ``'similarity'`` column, sorted in descending order.
    """
    # Row/column fancy indexing below needs a CSR (or CSC) matrix.
    if not isinstance(similarity_matrix, csr_matrix):
        similarity_matrix = csr_matrix(similarity_matrix)

    # Index labels double as positions in the similarity matrix.
    row_positions = np.asarray(filtered_df.index, dtype=np.intp)
    col_positions = np.asarray(project_df.index, dtype=np.intp)

    if row_positions.size and col_positions.size:
        # COO exposes every stored entry as a (row, col, value) triple whose
        # coordinates are local to the submatrix, i.e. positions in the frames.
        pairs = similarity_matrix[row_positions, :][:, col_positions].tocoo()
        scores, local_rows, local_cols = pairs.data, pairs.row, pairs.col
    else:
        # Nothing to match against; skip slicing the matrix altogether.
        scores = np.empty(0, dtype=float)
        local_rows = np.empty(0, dtype=np.intp)
        local_cols = np.empty(0, dtype=np.intp)

    # Clamp the requested count to what is actually available.
    top_n = max(0, min(int(top_x), scores.size))

    # Stable sort keeps the result deterministic when scores tie.
    best = np.argsort(scores, kind="stable")[::-1][:top_n]
    top_values = scores[best]

    # Positional selection is immune to duplicate or unsorted index labels.
    p1_df = filtered_df.iloc[local_rows[best]].copy()
    p1_df['similarity'] = top_values

    p2_df = project_df.iloc[local_cols[best]].copy()
    p2_df['similarity'] = top_values
    print("finished calc matches")

    return p1_df, p2_df