File size: 2,191 Bytes
f3a1940
 
f17e764
f3a1940
f17e764
f3a1940
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f17e764
 
 
 
 
 
 
 
f3a1940
f17e764
 
 
 
 
f3a1940
f17e764
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

"""
def find_similar(p_index, similarity_matrix, filtered_df, top_x):

    # filter out just projects from filtered df
    filtered_indices = filtered_df.index.tolist()

    index_position_mapping = {position: index for position, index in enumerate(filtered_indices)}

    filtered_column_sim_matrix = similarity_matrix[:, filtered_indices]

    # filter out the row of the selected project
    project_row = filtered_column_sim_matrix[p_index]
    sorted_indices = np.argsort(project_row)
    top_10_indices_descending = sorted_indices[-10:][::-1]
    #top_10_original_indices = [index_position_mapping[position] for position in top_10_indices_descending]
    top_10_values_descending = project_row[top_10_indices_descending]

    result_df = filtered_df.iloc[top_10_indices_descending]
    result_df["similarity"] = top_10_values_descending

    return result_df
"""
def find_similar(p_index, similarity_matrix, filtered_df, top_x):
    """Return the rows of ``filtered_df`` most similar to project ``p_index``.

    The index labels of ``filtered_df`` are used as column positions into
    ``similarity_matrix``, so they must be integers that are valid column
    indices of that matrix.

    Args:
        p_index: Row of ``similarity_matrix`` belonging to the selected
            project.
        similarity_matrix: Similarity scores; anything that is not already a
            ``csr_matrix`` is converted to one.
        filtered_df: DataFrame holding the candidate projects.
        top_x: Maximum number of rows to return. Values ``<= 0`` yield an
            empty result.

    Returns:
        A new DataFrame with at most ``top_x`` rows of ``filtered_df``,
        ordered by descending similarity, plus a ``similarity`` column.
        ``filtered_df`` itself is not modified.
    """
    # Ensure the similarity matrix is in CSR format so row access is cheap.
    if not isinstance(similarity_matrix, csr_matrix):
        similarity_matrix = csr_matrix(similarity_matrix)

    # Index labels double as column positions in the similarity matrix.
    column_ids = filtered_df.index.tolist()

    # Pull out the single project row first and only then restrict it to the
    # candidate columns; this avoids column-slicing the entire matrix.
    full_row = similarity_matrix.getrow(p_index).toarray().ravel()
    project_row = full_row[column_ids]

    if top_x > 0:
        # Positions (within filtered_df) of the highest scores, best first.
        top_positions = np.argsort(project_row)[-top_x:][::-1]
    else:
        # A plain ``[-0:]`` slice would select *every* element, so handle
        # non-positive requests explicitly.
        top_positions = np.empty(0, dtype=np.intp)

    # Positional selection keeps rows and scores aligned even when the index
    # contains duplicate labels; ``assign`` builds a new frame instead of
    # writing into a slice of the caller's DataFrame.
    return filtered_df.iloc[top_positions].assign(
        similarity=project_row[top_positions]
    )