Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 3,509 Bytes
f3a1940 5f41368 e4ce8fe f3a1940 e4ce8fe f17e764 e4ce8fe f17e764 3d9250a f17e764 5ca912a 127b334 5f41368 127b334 5ca912a f17e764 a88ee3f f17e764 5f41368 dda6e4a f17e764 a88ee3f f17e764 a88ee3f f17e764 a88ee3f f17e764 5f41368 3d9250a 5f41368 3d9250a 5f41368 e54232c 5f41368 e54232c 5f41368 a88ee3f 3137797 3d9250a 5f41368 3d9250a 5ca912a 3137797 a46e9cf 3137797 da4441d 3137797 2baee55 e0bcbc8 f17e764 da4441d f17e764 d7f99ce da4441d f17e764 6cad12f f3a1940 6cad12f f3a1940 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, coo_matrix
import streamlit as st
# multi_project_matching
def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
    """Return the ``top_x`` most similar (filtered, project) row pairs.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Rows corresponding to the *row* axis of ``similarity_matrix``.
        NOTE(review): assumes its index labels are valid positional row
        indices into ``similarity_matrix`` — confirm against the caller.
    project_df : pandas.DataFrame
        Rows corresponding to the *column* axis of ``similarity_matrix``
        (same positional-index assumption as above).
    similarity_matrix : scipy.sparse matrix or array-like
        Pairwise similarity scores; converted to CSR if needed.
    top_x : int
        Number of top-scoring pairs to return (clamped to the number of
        nonzero similarities available).

    Returns
    -------
    (p1_df, p2_df) : tuple of pandas.DataFrame
        Aligned slices of ``filtered_df`` and ``project_df`` for the top
        pairs, each with an added ``'similarity'`` column.
    """
    # Debug output shown in the Streamlit UI.
    st.write(filtered_df.shape)
    st.write(project_df.shape)
    st.write(similarity_matrix.shape)

    # CSR supports the row/column fancy indexing used below.
    if not isinstance(similarity_matrix, csr_matrix):
        similarity_matrix = csr_matrix(similarity_matrix)

    # Keep the labels as numpy arrays so they can themselves be
    # fancy-indexed with integer arrays later (a plain Python list
    # indexed by an ndarray raises TypeError).
    filtered_df_indices = np.asarray(filtered_df.index.to_list())
    project_df_indices = np.asarray(project_df.index.to_list())

    # Submatrix restricted to the rows/columns of interest:
    # rows <-> filtered_df entries, columns <-> project_df entries.
    match_matrix = similarity_matrix[filtered_df_indices, :][:, project_df_indices]
    st.write(match_matrix.shape)

    # COO exposes the nonzero values alongside their (row, col) coords.
    coo = match_matrix.tocoo()
    data = coo.data
    row_indices = coo.row
    col_indices = coo.col

    # Honor the caller-supplied top_x (was hard-coded to 15), clamped to
    # the number of nonzero similarity values actually present.
    top_n = min(top_x, len(data))

    # Positions of the top_n largest similarities, in descending order.
    top_n_order = np.argsort(data)[-top_n:][::-1]
    top_values = data[top_n_order]

    # Map submatrix positions back to original dataframe index labels.
    # Rows belong to filtered_df and columns to project_df — the earlier
    # draft had these swapped when building the result frames.
    original_row_labels = filtered_df_indices[row_indices[top_n_order]]
    original_col_labels = project_df_indices[col_indices[top_n_order]]

    # Build the aligned result frames; .copy() so the added column does
    # not mutate the caller's dataframes via a view.
    p1_df = filtered_df.loc[original_row_labels].copy()
    p1_df['similarity'] = top_values
    p2_df = project_df.loc[original_col_labels].copy()
    p2_df['similarity'] = top_values
    print("finished calc matches")
    return p1_df, p2_df
|