File size: 2,367 Bytes
f3a1940 f17e764 e4ce8fe f3a1940 e4ce8fe f17e764 e4ce8fe f17e764 eaf56c5 f17e764 5ca912a 127b334 5ca912a f17e764 a88ee3f f17e764 dda6e4a f17e764 a88ee3f f17e764 a88ee3f f17e764 a88ee3f f17e764 a88ee3f 5ca912a e0bcbc8 f17e764 5ca912a f17e764 d7f99ce 5ca912a f17e764 6cad12f f3a1940 6cad12f f3a1940 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import logging

import numpy as np
import pandas as pd
import streamlit as st
from scipy.sparse import csr_matrix, lil_matrix
# multi_project_matching
def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
    """Return the ``top_x`` most similar (filtered, project) row pairs.

    The index labels of both dataframes are used directly as row/column
    positions into ``similarity_matrix`` (rows are taken from
    ``filtered_df.index``, columns from ``project_df.index``), so both
    indexes must hold valid integer positions for that matrix.
    NOTE(review): ``.loc`` lookups below assume the index labels are unique
    within each dataframe -- confirm against the callers.

    Args:
        filtered_df: DataFrame whose index selects the rows of the matrix.
        project_df: DataFrame whose index selects the columns of the matrix.
        similarity_matrix: 2-D similarity scores; a dense array or a scipy
            sparse matrix.
        top_x: Maximum number of pairs to return. Values larger than the
            number of available pairs are clamped; values <= 0 yield empty
            results.

    Returns:
        ``(p1_df, p2_df)``: copies of the matching rows of ``filtered_df``
        and ``project_df``. Row ``i`` of ``p1_df`` is paired with row ``i``
        of ``p2_df``; both carry a ``'similarity'`` column and are ordered
        from highest to lowest similarity.
    """
    log = logging.getLogger(__name__)
    log.debug(
        "calc_matches input shapes: filtered=%s project=%s similarity=%s",
        filtered_df.shape,
        project_df.shape,
        similarity_matrix.shape,
    )

    # Index labels double as positions into the similarity matrix.
    filtered_labels = filtered_df.index.to_list()
    project_labels = project_df.index.to_list()

    # Explicit integer arrays keep fancy indexing valid for empty selections.
    row_selector = np.asarray(filtered_labels, dtype=np.intp)
    col_selector = np.asarray(project_labels, dtype=np.intp)

    # Rows <- filtered_df, columns <- project_df.
    match_matrix = similarity_matrix[row_selector, :][:, col_selector]
    if hasattr(match_matrix, "toarray"):
        # scipy sparse matrices have no flatten()/ravel(); densify the
        # (already reduced) submatrix before ranking its cells.
        match_matrix = match_matrix.toarray()
    match_matrix = np.asarray(match_matrix)
    log.debug("calc_matches submatrix shape: %s", match_matrix.shape)

    flat_scores = match_matrix.ravel()
    # Never ask for more pairs than exist; argpartition would raise.
    k = max(0, min(int(top_x), flat_scores.size))

    if k == 0:
        p1_df = filtered_df.iloc[0:0].copy()
        p2_df = project_df.iloc[0:0].copy()
        p1_df['similarity'] = np.array([], dtype=float)
        p2_df['similarity'] = np.array([], dtype=float)
        return p1_df, p2_df

    # argpartition finds the k largest cells in O(n) but leaves them
    # unordered, so sort just those k by descending score afterwards.
    top_flat = np.argpartition(flat_scores, -k)[-k:]
    top_flat = top_flat[np.argsort(-flat_scores[top_flat], kind="stable")]

    # Convert flat positions back to (row, column) positions in the submatrix.
    row_positions, col_positions = np.unravel_index(top_flat, match_matrix.shape)
    top_values = match_matrix[row_positions, col_positions]

    # Rows belong to filtered_df and columns to project_df.
    top_filtered_labels = [filtered_labels[i] for i in row_positions]
    top_project_labels = [project_labels[j] for j in col_positions]
    log.debug("calc_matches top filtered labels: %s", top_filtered_labels)

    # Build the paired result frames with their similarity scores.
    p1_df = filtered_df.loc[top_filtered_labels].copy()
    p1_df['similarity'] = top_values

    p2_df = project_df.loc[top_project_labels].copy()
    p2_df['similarity'] = top_values

    log.debug("finished calc matches")
    return p1_df, p2_df
|