Jan Mühlnikel committed on
Commit
5f41368
·
1 Parent(s): 2baee55

experiment

Browse files
Files changed (1) hide show
  1. functions/calc_matches.py +31 -12
functions/calc_matches.py CHANGED
@@ -1,6 +1,6 @@
1
  import pandas as pd
2
  import numpy as np
3
- from scipy.sparse import csr_matrix, lil_matrix
4
  import streamlit as st
5
 
6
  # multi_project_matching
@@ -19,11 +19,30 @@ def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
19
 
20
  # Create mapping dictionaries
21
  filtered_df_index_map = {i: index for i, index in enumerate(filtered_df_indices)}
 
22
  project_df_index_map = {i: index for i, index in enumerate(project_df_indices)}
23
 
24
  # Select submatrix based on indices from both dataframes
25
  match_matrix = similarity_matrix[filtered_df_indices, :][:, project_df_indices]
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  st.write(match_matrix.shape)
28
 
29
  # Get the linear indices of the top 'top_x' values
@@ -38,29 +57,29 @@ def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
38
  # Get the corresponding similarity values
39
  #top_values = match_matrix.data[linear_indices]
40
 
41
- flat_data = match_matrix.data
42
 
43
  # Get the indices that would sort the data array in descending order
44
- sorted_indices = np.argsort(flat_data)[::-1]
45
 
46
  # Take the first k indices to get the top k maximum values
47
- top_indices = sorted_indices[:top_x]
48
- top_row_indices = []
49
- top_col_indices = []
50
 
51
- for idx in top_indices:
52
- row, col = np.unravel_index(idx, match_matrix.shape)
53
- top_row_indices.append(row)
54
- top_col_indices.append(col)
55
 
56
- st.write(top_col_indices)
57
  # Convert flat indices to 2D row and column indices
58
  #row_indices, col_indices = match_matrix.nonzero()
59
  #row_indices = row_indices[top_indices]
60
  #col_indices = col_indices[top_indices]
61
 
62
  # Get the values corresponding to the top k indices
63
- top_values = flat_data[top_indices]
64
 
65
 
66
  # Get the values corresponding to the top k indices
 
1
  import pandas as pd
2
  import numpy as np
3
+ from scipy.sparse import csr_matrix, coo_matrix
4
  import streamlit as st
5
 
6
  # multi_project_matching
 
19
 
20
  # Create mapping dictionaries
21
  filtered_df_index_map = {i: index for i, index in enumerate(filtered_df_indices)}
22
+ st.write(filtered_df_index_map)
23
  project_df_index_map = {i: index for i, index in enumerate(project_df_indices)}
24
 
25
  # Select submatrix based on indices from both dataframes
26
  match_matrix = similarity_matrix[filtered_df_indices, :][:, project_df_indices]
27
 
28
+ coo = match_matrix.tocoo()
29
+
30
+ data = coo.data
31
+ row_indices = coo.row
32
+ col_indices = coo.col
33
+
34
+ top_n = 15
35
+ if len(data) < top_n:
36
+ top_n = len(data)
37
+ top_n_indices = np.argsort(data)[-top_n:][::-1]
38
+
39
+ top_n_percentages = data[top_n_indices]
40
+ top_n_row_indices = row_indices[top_n_indices]
41
+ top_n_col_indices = col_indices[top_n_indices]
42
+
43
+ original_row_indices = filtered_df_indices[top_n_row_indices]
44
+ original_col_indices = project_df_indices[top_n_col_indices]
45
+
46
  st.write(match_matrix.shape)
47
 
48
  # Get the linear indices of the top 'top_x' values
 
57
  # Get the corresponding similarity values
58
  #top_values = match_matrix.data[linear_indices]
59
 
60
+ #flat_data = match_matrix.data
61
 
62
  # Get the indices that would sort the data array in descending order
63
+ #sorted_indices = np.argsort(flat_data)[::-1]
64
 
65
  # Take the first k indices to get the top k maximum values
66
+ #top_indices = sorted_indices[:top_x]
67
+ #top_row_indices = []
68
+ #top_col_indices = []
69
 
70
+ #for idx in top_indices:
71
+ # row, col = np.unravel_index(idx, match_matrix.shape)
72
+ # top_row_indices.append(row)
73
+ # top_col_indices.append(col)
74
 
75
+ #st.write(top_col_indices)
76
  # Convert flat indices to 2D row and column indices
77
  #row_indices, col_indices = match_matrix.nonzero()
78
  #row_indices = row_indices[top_indices]
79
  #col_indices = col_indices[top_indices]
80
 
81
  # Get the values corresponding to the top k indices
82
+ #top_values = flat_data[top_indices]
83
 
84
 
85
  # Get the values corresponding to the top k indices