awacke1 committed on
Commit
193042f
·
verified ·
1 Parent(s): 9b95cb7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -21
app.py CHANGED
@@ -68,6 +68,9 @@ class FastDatasetSearcher:
68
  st.error("Please set the DATASET_KEY environment variable with your Hugging Face token.")
69
  st.stop()
70
 
 
 
 
71
  # Load dataset info if not already loaded
72
  if st.session_state['dataset_info'] is None:
73
  st.session_state['dataset_info'] = get_dataset_info(self.dataset_id, self.token)
@@ -81,29 +84,48 @@ class FastDatasetSearcher:
81
  if df.empty:
82
  return df
83
 
84
- scores = []
85
- query_embedding = self.text_model.encode([query], show_progress_bar=False)[0]
86
-
87
- for _, row in df.iterrows():
88
- # Combine all searchable text fields
89
- text_values = []
90
- for v in row.values():
91
- if isinstance(v, (str, int, float)):
92
- text_values.append(str(v))
93
- elif isinstance(v, (list, dict)):
94
- text_values.append(str(v))
95
- text = ' '.join(text_values)
96
-
97
- # Quick keyword match
98
- keyword_score = text.lower().count(query.lower()) / (len(text.split()) + 1) # Add 1 to avoid division by zero
99
 
100
- # Semantic search on combined text
101
- text_embedding = self.text_model.encode([text], show_progress_bar=False)[0]
102
- semantic_score = cosine_similarity([query_embedding], [text_embedding])[0][0]
 
103
 
104
- # Combine scores
105
- combined_score = 0.5 * semantic_score + 0.5 * keyword_score
106
- scores.append(combined_score)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  # Get top results
109
  results_df = df.copy()
 
68
  st.error("Please set the DATASET_KEY environment variable with your Hugging Face token.")
69
  st.stop()
70
 
71
+ # Initialize numpy for model inputs
72
+ self.np = np
73
+
74
  # Load dataset info if not already loaded
75
  if st.session_state['dataset_info'] is None:
76
  st.session_state['dataset_info'] = get_dataset_info(self.dataset_id, self.token)
 
84
  if df.empty:
85
  return df
86
 
87
+ try:
88
+ # Get columns to search (excluding numpy array columns)
89
+ searchable_cols = []
90
+ for col in df.columns:
91
+ sample_val = df[col].iloc[0]
92
+ if not isinstance(sample_val, (np.ndarray, bytes)):
93
+ searchable_cols.append(col)
 
 
 
 
 
 
 
 
94
 
95
+ # Prepare query
96
+ query_lower = query.lower()
97
+ query_embedding = self.text_model.encode([query], show_progress_bar=False)[0]
98
+ scores = []
99
 
100
+ # Process each row
101
+ for _, row in df.iterrows():
102
+ # Combine text from searchable columns
103
+ text_parts = []
104
+ for col in searchable_cols:
105
+ val = row[col]
106
+ if val is not None:
107
+ if isinstance(val, (list, dict)):
108
+ text_parts.append(str(val))
109
+ else:
110
+ text_parts.append(str(val))
111
+
112
+ text = ' '.join(text_parts)
113
+
114
+ # Calculate scores
115
+ if text.strip():
116
+ # Keyword matching
117
+ keyword_score = text.lower().count(query_lower) / max(len(text.split()), 1)
118
+
119
+ # Semantic matching
120
+ text_embedding = self.text_model.encode([text], show_progress_bar=False)[0]
121
+ semantic_score = float(cosine_similarity([query_embedding], [text_embedding])[0][0])
122
+
123
+ # Combine scores
124
+ combined_score = 0.5 * semantic_score + 0.5 * keyword_score
125
+ else:
126
+ combined_score = 0.0
127
+
128
+ scores.append(combined_score)
129
 
130
  # Get top results
131
  results_df = df.copy()