Commit 6f98944
Hussam committed
Parent(s): 3da2136

W.I.P: fixing vector index format and search pipeline
src/ctp_slack_bot/db/mongo_db.py
CHANGED
@@ -113,26 +113,32 @@ class MongoDB(BaseModel):
         try:
             # Create the vector search index with the proper MongoDB format
             vector_search_index = {
-                "similarity": "cosine"
-            }
+                "fields": [
+                    {
+                        "type": "vector",
+                        "path": "embedding",
+                        "numDimensions": self.settings.VECTOR_DIMENSION,
+                        "similarity": "cosine"
                     }
+                ]
             }

+            try:
+                # Using createSearchIndex command which is the proper way to create vector search indexes
+                await self.db.command({
+                    "createSearchIndex": collection_name,
+                    "name": f"{collection_name}_vector_index",
+                    "definition": vector_search_index
+                })
+                logger.info("Created vector search index for collection {}", collection_name)
+            except Exception as e:
+                if "command not found" in str(e).lower():
+                    logger.warning("Vector search not supported by this MongoDB instance. Some functionality may be limited.")
+                    # Create a fallback standard index on embedding field
+                    await collection.create_index("embedding")
+                    logger.info("Created standard index on 'embedding' field as fallback")
+                else:
+                    raise
         except Exception as e:
             logger.error("Failed to create vector index: {}", e)
             raise
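
For reference, the new definition follows the Atlas Vector Search index format. A minimal standalone sketch of creating such an index through the createSearchIndexes database command (Atlas, or a deployment with search support) using Motor; the URI, database name, collection name, and dimension below are placeholders, not values from this repo:

# Sketch only: create an Atlas Vector Search index over an "embedding" field.
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient

async def create_vector_index() -> None:
    client = AsyncIOMotorClient("mongodb+srv://user:pass@cluster.example.net")  # placeholder URI
    db = client["example_db"]  # placeholder database name
    definition = {
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 1536,  # must match the embedding model's output size
                "similarity": "cosine",
            }
        ]
    }
    # createSearchIndexes takes a list of index specs; "type": "vectorSearch"
    # marks this as a vector index rather than a full-text search index.
    await db.command({
        "createSearchIndexes": "vectors",  # placeholder collection name
        "indexes": [
            {
                "name": "vectors_vector_index",
                "type": "vectorSearch",
                "definition": definition,
            }
        ],
    })

asyncio.run(create_vector_index())

Search index commands are not available on every deployment (historically Atlas only), which is what the fallback branch in the commit guards against by creating a plain index on the embedding field instead.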
src/ctp_slack_bot/services/content_ingestion_service.py
CHANGED
@@ -34,16 +34,16 @@ class ContentIngestionService(BaseModel):
         # logger.debug("Ignored content with ID {} because it already exists in the database.", content.id)
         # return
         chunks = content.get_chunks()
-        self.__vectorize_and_store_chunks_in_database(chunks)
+        await self.__vectorize_and_store_chunks_in_database(chunks)
         logger.debug("Stored {} vectorized chunk(s) in the database.", len(chunks))

     async def process_incoming_slack_message(self: Self, slack_message: SlackMessage) -> None:
         logger.debug("Content ingestion service received a Slack message: {}", slack_message.text)
         chunks = slack_message.get_chunks()
-        self.__vectorize_and_store_chunks_in_database(chunks)
+        await self.__vectorize_and_store_chunks_in_database(chunks)
         logger.debug("Stored {} vectorized chunk(s) in the database.", len(chunks))

-    def __vectorize_and_store_chunks_in_database(self: Self, chunks: Sequence[Chunk]) -> None:
+    async def __vectorize_and_store_chunks_in_database(self: Self, chunks: Sequence[Chunk]) -> None:
+        vectorized_chunks = self.vectorization_service.vectorize(chunks)  # TODO
+        await self.vector_database_service.store(vectorized_chunks)  # TODO
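
The added await matters because the helper is now a coroutine: calling an async def function without awaiting it only creates a coroutine object, so the vectorize-and-store work would silently never run. A tiny illustration with hypothetical names, not code from this repo:

import asyncio

async def store_chunks(chunks: list[str]) -> None:
    # Stand-in for the now-async vectorize-and-store helper.
    await asyncio.sleep(0)  # simulate async I/O (vectorization + database write)
    print(f"stored {len(chunks)} chunk(s)")

async def main() -> None:
    # store_chunks(["a", "b"])   # without await: coroutine object created, body never executes
    await store_chunks(["a", "b"])  # with await: the helper actually runs

asyncio.run(main())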
src/ctp_slack_bot/services/vector_database_service.py
CHANGED
@@ -87,16 +87,15 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
         # Get the vector collection
         vector_collection = await self.mongo_db.get_collection("vectors")

-        # Build aggregation pipeline for vector search
+        # Build aggregation pipeline for vector search using official MongoDB format
         pipeline = [
             {
-                "$
+                "$vectorSearch": {
                     "index": "vectors_vector_index",
+                    "queryVector": list(query.query_embeddings),
+                    "path": "embedding",
+                    "numCandidates": query.k * 10,
+                    "limit": query.k
                 }
             },
             {
@@ -105,7 +104,7 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
                     "metadata": 1,
                     "parent_id": 1,
                     "chunk_id": 1,
-                    "score": { "$meta": "
+                    "score": { "$meta": "vectorSearchScore" }
                 }
             }
         ]
@@ -115,7 +114,7 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
             metadata_filter = {f"metadata.{k}": v for k, v in query.filter_metadata.items()}
             pipeline.insert(1, {"$match": metadata_filter})

-        # Add score threshold filter
+        # Add score threshold filter if needed
         if query.score_threshold > 0:
             pipeline.append({
                 "$match": {
@@ -123,8 +122,17 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
                 }
             })

+        try:
+            # Execute the vector search pipeline
+            results = await vector_collection.aggregate(pipeline).to_list(length=query.k)
+        except Exception as e:
+            logger.warning(f"Vector search failed: {str(e)}. Falling back to basic text search.")
+            # Fall back to basic filtering with limit
+            query_filter = {}
+            if query.filter_metadata:
+                query_filter.update({f"metadata.{k}": v for k, v in query.filter_metadata.items()})
+
+            results = await vector_collection.find(query_filter).limit(query.k).to_list(length=query.k)

         # Convert results to Chunk objects
         chunks = []
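
Taken together, the hunks assemble a pipeline with a leading $vectorSearch stage, an optional $match on metadata inserted right after it, a $project that exposes the vectorSearchScore, and an optional score-threshold $match at the end. A rough sketch of the assembled pipeline with illustrative values (the query vector, k, threshold, and the exact shape of the threshold filter are assumptions, not taken from the repo):

# Sketch only: the shape of the pipeline this service builds, with example values.
query_vector = [0.1, 0.2, 0.3]  # embedding of the user's question
k = 5                           # number of chunks to return
score_threshold = 0.75          # optional minimum similarity score

pipeline = [
    {
        "$vectorSearch": {
            "index": "vectors_vector_index",
            "queryVector": query_vector,
            "path": "embedding",
            "numCandidates": k * 10,  # oversample candidates for better recall
            "limit": k,
        }
    },
    # An optional metadata pre-filter would be inserted at position 1,
    # e.g. {"$match": {"metadata.channel": "general"}}.
    {
        "$project": {
            # ...whatever content fields the service projects, plus:
            "metadata": 1,
            "parent_id": 1,
            "chunk_id": 1,
            "score": {"$meta": "vectorSearchScore"},
        }
    },
]
if score_threshold > 0:
    # Assumed shape of the threshold filter; the committed condition is not visible in this hunk.
    pipeline.append({"$match": {"score": {"$gte": score_threshold}}})

$vectorSearch must be the first stage of the pipeline, which is why the metadata $match is inserted after it (position 1) rather than prepended; numCandidates trades recall against latency.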