File size: 1,721 Bytes
0ef6d21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import numpy as np
from scipy.sparse import csr_matrix

"""
Find similar projects for single project matching.

Single Project Matching lets you choose an individual project by its
IATI ID or title and then shows the top x projects within a filter (filtered_df)
that most closely resemble the selected one (p_index).
"""

def find_similar(p_index, similarity_matrix, filtered_df, top_x):
    """
    Return the top_x rows of filtered_df most similar to the selected project.

    p_index: row position of the selected project in similarity_matrix
    similarity_matrix: matrix with similarities of all projects; converted to
        a csr_matrix if it is not one already
    filtered_df: df with filter applied; its index labels are used as column
        positions into similarity_matrix
    top_x: maximum number of projects to return; values <= 0 yield an empty
        result

    Returns a new DataFrame (filtered_df itself is not modified) holding the
    selected rows ordered by descending similarity, with an added
    'similarity' column. The selected project is not excluded, so it can
    appear in the result when it is part of filtered_df.
    """

    # Guard the degenerate cases up front: with top_x == 0 the slice
    # [-top_x:] would be [-0:] and select *every* project, and a negative
    # top_x would silently drop the best matches instead of returning none.
    if top_x <= 0 or len(filtered_df) == 0:
        empty_df = filtered_df.iloc[:0].copy()
        empty_df['similarity'] = np.empty(0, dtype=float)
        return empty_df

    # convert npz sparse matrix into csr matrix
    if not isinstance(similarity_matrix, csr_matrix):
        similarity_matrix = csr_matrix(similarity_matrix)

    # Column positions of the projects that survived the filter.
    filtered_indices = filtered_df.index.tolist()

    # Take the selected project's row first and only then restrict it to the
    # filtered columns; this avoids column-slicing the whole CSR matrix.
    project_row = (
        similarity_matrix.getrow(p_index)[:, filtered_indices].toarray().ravel()
    )

    # Positions (within filtered_df) of the top_x highest similarity scores,
    # best match first.
    sorted_positions = np.argsort(project_row)[-top_x:][::-1]

    # Select positionally so duplicate index labels cannot multiply rows, and
    # copy so adding the column never writes into (or warns about) a view of
    # the caller's DataFrame.
    result_df = filtered_df.iloc[sorted_positions].copy()
    result_df['similarity'] = project_row[sorted_positions]

    return result_df