awacke1 committed
Commit 28280be · verified · 1 Parent(s): 959152c

Update app.py

Files changed (1)
  1. app.py +42 -10
app.py CHANGED
@@ -47,10 +47,13 @@ class DatasetSearcher:

         # Store column information
         self.columns = list(self.df.columns)
-        self.text_columns = [col for col in self.columns
-                             if self.df[col].dtype == 'object'
-                             and not any(term in col.lower()
-                                         for term in ['embed', 'vector', 'encoding'])]
+        # Identify searchable columns
+        self.text_columns = []
+        for col in self.columns:
+            if col.lower() not in ['embed', 'vector', 'encoding']:
+                sample_val = self.df[col].iloc[0] if not self.df.empty else None
+                if isinstance(sample_val, (str, int, float, list, dict)) or sample_val is None:
+                    self.text_columns.append(col)

         # Update session state columns
         st.session_state['search_columns'] = self.text_columns
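As a rough illustration of what the new column filter selects, here is a minimal standalone sketch against a made-up DataFrame (the column names and values are hypothetical, not from the app):

import pandas as pd

# Hypothetical data; the real dataset's columns will differ
df = pd.DataFrame({
    "title": ["foo", "bar"],                # str sample -> kept
    "score": [1.5, 2.0],                    # float sample -> kept
    "tags": [["a", "b"], ["c"]],            # list sample -> kept
    "embedding": [[0.1, 0.2], [0.3, 0.4]],  # only the exact names 'embed'/'vector'/'encoding' are excluded, so 'embedding' is kept
})

text_columns = []
for col in df.columns:
    if col.lower() not in ['embed', 'vector', 'encoding']:
        sample_val = df[col].iloc[0] if not df.empty else None
        if isinstance(sample_val, (str, int, float, list, dict)) or sample_val is None:
            text_columns.append(col)

print(text_columns)  # ['title', 'score', 'tags', 'embedding']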
@@ -66,16 +69,32 @@ class DatasetSearcher:
     def prepare_features(self):
         """Prepare text embeddings for semantic search"""
         try:
-            # Combine text columns for embedding
-            combined_text = self.df[self.text_columns].fillna('').agg(' '.join, axis=1)
+            # Process text columns and handle different data types
+            processed_texts = []
+            for _, row in self.df.iterrows():
+                row_texts = []
+                for col in self.text_columns:
+                    value = row[col]
+                    if isinstance(value, (list, dict)):
+                        # Convert lists or dicts to string representation
+                        row_texts.append(str(value))
+                    elif isinstance(value, (int, float)):
+                        # Convert numbers to strings
+                        row_texts.append(str(value))
+                    elif value is None:
+                        row_texts.append('')
+                    else:
+                        # Handle string values
+                        row_texts.append(str(value))
+                processed_texts.append(' '.join(row_texts))

             # Create embeddings in batches to manage memory
             batch_size = 32
             all_embeddings = []

             with st.spinner("Preparing search features..."):
-                for i in range(0, len(combined_text), batch_size):
-                    batch = combined_text[i:i+batch_size].tolist()
+                for i in range(0, len(processed_texts), batch_size):
+                    batch = processed_texts[i:i+batch_size]
                     embeddings = self.text_model.encode(batch)
                     all_embeddings.append(embeddings)
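A condensed, standalone sketch of the same row-flattening idea, assuming a small hypothetical DataFrame; the separate list/dict/number branches are collapsed into a single str() call here, and the embedding and spinner parts are omitted:

import pandas as pd

# Hypothetical rows mixing strings, numbers, lists, dicts, and a missing value
df = pd.DataFrame({
    "title": ["Alpha", None],
    "year": [2021, 2022],
    "tags": [["nlp", "search"], {"genre": "docs"}],
})
text_columns = ["title", "year", "tags"]

processed_texts = []
for _, row in df.iterrows():
    row_texts = []
    for col in text_columns:
        value = row[col]
        if value is None:
            row_texts.append('')          # missing values become empty strings
        else:
            row_texts.append(str(value))  # lists, dicts, and numbers are stringified
    processed_texts.append(' '.join(row_texts))

print(processed_texts)
# ["Alpha 2021 ['nlp', 'search']", " 2022 {'genre': 'docs'}"]

Joining per-column strings row by row replaces the previous fillna('') + agg(' '.join) pipeline, which assumed every searchable column already held strings.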
@@ -98,10 +117,23 @@ class DatasetSearcher:
         search_columns = [column] if column and column != "All Fields" else self.text_columns
         keyword_scores = np.zeros(len(self.df))

+        query_lower = query.lower()
         for col in search_columns:
             if col in self.df.columns:
-                matches = self.df[col].fillna('').str.lower().str.count(query.lower())
-                keyword_scores += matches
+                for idx, value in enumerate(self.df[col]):
+                    if isinstance(value, (list, dict)):
+                        # Search in string representation of lists or dicts
+                        text = str(value).lower()
+                    elif isinstance(value, (int, float)):
+                        # Convert numbers to strings for searching
+                        text = str(value).lower()
+                    elif value is None:
+                        text = ''
+                    else:
+                        # Handle string values
+                        text = str(value).lower()
+
+                    keyword_scores[idx] += text.count(query_lower)

         # Combine scores
         combined_scores = 0.5 * similarities + 0.5 * (keyword_scores / max(1, keyword_scores.max()))
 
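Finally, a condensed sketch of the new per-cell keyword scoring on hypothetical data; the semantic `similarities` term and the surrounding method are omitted, and the type branches are again collapsed into str():

import numpy as np
import pandas as pd

# Hypothetical data, not from the app
df = pd.DataFrame({
    "title": ["Deep learning for search", "Classical IR", None],
    "tags": [["search", "neural"], ["bm25"], {"topic": "search"}],
})
text_columns = ["title", "tags"]

query = "Search"
query_lower = query.lower()
keyword_scores = np.zeros(len(df))

for col in text_columns:
    for idx, value in enumerate(df[col]):
        if value is None:
            text = ''                  # missing values contribute nothing
        else:
            text = str(value).lower()  # lists and dicts are searched via their string form
        keyword_scores[idx] += text.count(query_lower)

print(keyword_scores)  # [2. 0. 1.]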