Spaces:

awacke1
/

CodeCompetitionClaudeVsGPT

Running

App Files Files Community

awacke1 committed on Dec 19, 2024

Commit

193042f

verified ·

1 Parent(s): 9b95cb7

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -21

app.py CHANGED Viewed

@@ -68,6 +68,9 @@ class FastDatasetSearcher:
             st.error("Please set the DATASET_KEY environment variable with your Hugging Face token.")
             st.stop()
         # Load dataset info if not already loaded
         if st.session_state['dataset_info'] is None:
             st.session_state['dataset_info'] = get_dataset_info(self.dataset_id, self.token)
@@ -81,29 +84,48 @@ class FastDatasetSearcher:
         if df.empty:
             return df
-        scores = []
-        query_embedding = self.text_model.encode([query], show_progress_bar=False)[0]
-        for _, row in df.iterrows():
-            # Combine all searchable text fields
-            text_values = []
-            for v in row.values():
-                if isinstance(v, (str, int, float)):
-                    text_values.append(str(v))
-                elif isinstance(v, (list, dict)):
-                    text_values.append(str(v))
-            text = ' '.join(text_values)
-            # Quick keyword match
-            keyword_score = text.lower().count(query.lower()) / (len(text.split()) + 1)  # Add 1 to avoid division by zero
-            # Semantic search on combined text
-            text_embedding = self.text_model.encode([text], show_progress_bar=False)[0]
-            semantic_score = cosine_similarity([query_embedding], [text_embedding])[0][0]
-            # Combine scores
-            combined_score = 0.5 * semantic_score + 0.5 * keyword_score
-            scores.append(combined_score)
         # Get top results
         results_df = df.copy()

             st.error("Please set the DATASET_KEY environment variable with your Hugging Face token.")
             st.stop()
+        # Initialize numpy for model inputs
+        self.np = np
         # Load dataset info if not already loaded
         if st.session_state['dataset_info'] is None:
             st.session_state['dataset_info'] = get_dataset_info(self.dataset_id, self.token)
         if df.empty:
             return df
+        try:
+            # Get columns to search (excluding numpy array columns)
+            searchable_cols = []
+            for col in df.columns:
+                sample_val = df[col].iloc[0]
+                if not isinstance(sample_val, (np.ndarray, bytes)):
+                    searchable_cols.append(col)
+            # Prepare query
+            query_lower = query.lower()
+            query_embedding = self.text_model.encode([query], show_progress_bar=False)[0]
+            scores = []
+            # Process each row
+            for _, row in df.iterrows():
+                # Combine text from searchable columns
+                text_parts = []
+                for col in searchable_cols:
+                    val = row[col]
+                    if val is not None:
+                        if isinstance(val, (list, dict)):
+                            text_parts.append(str(val))
+                        else:
+                            text_parts.append(str(val))
+                text = ' '.join(text_parts)
+                # Calculate scores
+                if text.strip():
+                    # Keyword matching
+                    keyword_score = text.lower().count(query_lower) / max(len(text.split()), 1)
+                    # Semantic matching
+                    text_embedding = self.text_model.encode([text], show_progress_bar=False)[0]
+                    semantic_score = float(cosine_similarity([query_embedding], [text_embedding])[0][0])
+                    # Combine scores
+                    combined_score = 0.5 * semantic_score + 0.5 * keyword_score
+                else:
+                    combined_score = 0.0
+                scores.append(combined_score)
         # Get top results
         results_df = df.copy()