Spaces:

KingZack
/

ctp-slack-bot

Runtime error

App Files Files Community

Hussam committed on Apr 14

Commit

06d7b2d

1 Parent(s): 93f6882

Revised MongoDB index creation, chunk storage, and similarity search to use the new Chunk model

Browse files

Files changed (3) hide show

src/ctp_slack_bot/db/mongo_db.py +50 -5
src/ctp_slack_bot/services/context_retrieval_service.py +1 -1
src/ctp_slack_bot/services/vector_database_service.py +75 -60

src/ctp_slack_bot/db/mongo_db.py CHANGED Viewed

@@ -83,16 +83,61 @@ class MongoDB(BaseModel):
             return False
     async def get_collection(self: Self, name: str) -> Any:
-        """Get a collection by name with validation."""
         if not await self.ping():
             raise ConnectionError("MongoDB connection is not available")
         return self.db[name]
-    async def create_indexes(self: Self, collection_name: str, indexes: list) -> None:
-        """Create indexes on a collection."""
         collection = await self.get_collection(collection_name)
-        await collection.create_indexes(indexes)
-        logger.info("Created indexes for collection {}: {}", collection_name, indexes)
     async def close(self: Self) -> None:
         """Close MongoDB connection."""

             return False
     async def get_collection(self: Self, name: str) -> Any:
+        """
+        Get a collection by name with validation.
+        Creates the collection if it doesn't exist.
+        Raises ConnectionError if the MongoDB ping check fails.
+        """
         if not await self.ping():
             raise ConnectionError("MongoDB connection is not available")
+        # Get all collection names to check if this one exists
+        collection_names = await self.db.list_collection_names()
+        if name not in collection_names:
+            logger.info(f"Collection {name} does not exist. Creating it.")
+            # Create the collection
+            await self.db.create_collection(name)
         return self.db[name]
+    async def create_indexes(self: Self, collection_name: str, indexes: list = None) -> None:
+        """
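+        Create indexes on a collection.
+        If a non-empty `indexes` list is given, those indexes are created on the collection.
+        Otherwise a cosine-similarity vector search index named
+        `<collection_name>_vector_index` is created on the `embedding` field,
+        with its dimension taken from `settings.VECTOR_DIMENSION`.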
+        """
         collection = await self.get_collection(collection_name)
+        if indexes:
+            await collection.create_indexes(indexes)
+            logger.info("Created custom indexes for collection {}: {}", collection_name, indexes)
+        else: # Create vector search index using settings from config
+            try:
+                # Create the vector search index with the proper MongoDB format
+                vector_search_index = {
+                    "mappings": {
+                        "dynamic": True,
+                        "fields": {
+                            "embedding": {
+                                "type": "knnVector",
+                                "dimensions": self.settings.VECTOR_DIMENSION,
+                                "similarity": "cosine"
+                            }
+                        }
+                    }
+                }
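+                # Issue the createSearchIndex database command to create the vector search index.
+                # NOTE(review): MongoDB documents this command as `createSearchIndexes` (plural, taking an
+                # `indexes` array of {name, definition}) — confirm the command name and shape used below.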
+                await self.db.command({
+                    "createSearchIndex": collection_name,
+                    "name": f"{collection_name}_vector_index",
+                    "definition": vector_search_index
+                })
+                logger.info("Created vector search index for collection {}", collection_name)
+            except Exception as e:
+                logger.error("Failed to create vector index: {}", e)
+                raise
     async def close(self: Self) -> None:
         """Close MongoDB connection."""

src/ctp_slack_bot/services/context_retrieval_service.py CHANGED Viewed

@@ -22,7 +22,7 @@ class ContextRetrievalService(BaseModel):
         logger.debug("Created {}", self.__class__.__name__)
         return self
-    async def get_context(self, message: SlackMessage) -> Sequence[Chunk]:
         """
         Retrieve relevant context for a given SlackMessage by vectorizing the message and
         querying the vectorstore.

         logger.debug("Created {}", self.__class__.__name__)
         return self
+    async def get_context(self: Self, message: SlackMessage) -> Sequence[Chunk]:
         """
         Retrieve relevant context for a given SlackMessage by vectorizing the message and
         querying the vectorstore.

src/ctp_slack_bot/services/vector_database_service.py CHANGED Viewed

@@ -4,13 +4,12 @@ from typing import Any, Collection, Dict, List, Optional, Self, Sequence
 from ctp_slack_bot.core import Settings
 from ctp_slack_bot.db import MongoDB
-from ctp_slack_bot.models import Chunk, Content, VectorizedChunk, VectorQuery
 class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
     """
     Service for storing and retrieving vector embeddings from MongoDB.
     """
     settings: Settings
     mongo_db: MongoDB
@@ -18,69 +17,82 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
     def post_init(self: Self) -> Self:
         logger.debug("Created {}", self.__class__.__name__)
         return self
-    # Should not allow initialization calls to bubble up all the way to the surface ― sequester in `post_init` or the class on which it depends.
-    # async def initialize(self) -> None:
-    #     """
-    #     Initialize the database connection.
-    #     """
-    #     await self.mongo_db.initialize()
     # TODO: Weigh the cost of going all async.
-    async def store(self, chunks: Collection[VectorizedChunk]) -> None:
         """
-        Store text and its embedding vector in the database.
         Args:
-            text: The text content to store
-            embedding: The vector embedding of the text
-            metadata: Additional metadata about the text (source, timestamp, etc.)
-        Returns:
-            str: The ID of the stored document
         """
-        if not self.mongo_db.initialized:
-            await self.mongo_db.initialize()
         try:
-            # Create document to store
-            document = {
-                "text": text,
-                "embedding": embedding,
-                "metadata": metadata
-            }
-            # Insert into collection
-            result = await self.mongo_db.vector_collection.insert_one(document)
-            logger.debug(f"Stored document with ID: {result.inserted_id}")
-            return str(result.inserted_id)
         except Exception as e:
-            logger.error(f"Error storing embedding: {str(e)}")
             raise
-    async def search_by_similarity(self, query: VectorQuery) -> Sequence[Chunk]:
         """
         Query the vector database for similar documents.
         Args:
             query: VectorQuery object with search parameters
-            query_embedding: The vector embedding of the query text
         Returns:
-            List[RetreivedContext]: List of similar documents with similarity scores
         """
-        if not self.mongo_db.initialized:
-            await self.mongo_db.initialize()
         try:
             # Build aggregation pipeline for vector search
             pipeline = [
                 {
                     "$search": {
-                        "index": "vector_index",
                         "knnBeta": {
-                            "vector": query_embedding,
                             "path": "embedding",
                             "k": query.k
                         }
@@ -88,10 +100,11 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
                 },
                 {
                     "$project": {
-                        "_id": 0,
                         "text": 1,
                         "metadata": 1,
-                        "score": {"$meta": "searchScore"}
                     }
                 }
             ]
@@ -101,31 +114,33 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
                 metadata_filter = {f"metadata.{k}": v for k, v in query.filter_metadata.items()}
                 pipeline.insert(1, {"$match": metadata_filter})
             # Execute the pipeline
-            results = await self.mongo_db.vector_collection.aggregate(pipeline).to_list(length=query.k)
-            # Convert to RetreivedContext objects directly
-            context_results = []
             for result in results:
-                # Normalize score to [0,1] range
-                normalized_score = result.get("score", 0)
-                # Skip if below threshold
-                if normalized_score < query.score_threshold:
-                    continue
-                context_results.append(
-                    Content(
-                        contextual_text=result["text"],
-                        metadata_source=result["metadata"].get("source", "unknown"),
-                        similarity_score=normalized_score,
-                        said_by=result["metadata"].get("speaker", None),
-                        in_reation_to_question=result["metadata"].get("related_question", None)
-                    )
                 )
-            logger.debug(f"Found {len(context_results)} similar documents")
-            return context_results
         except Exception as e:
             logger.error(f"Error in similarity search: {str(e)}")

 from ctp_slack_bot.core import Settings
 from ctp_slack_bot.db import MongoDB
+from ctp_slack_bot.models import Chunk, VectorizedChunk, VectorQuery
 class VectorDatabaseService(BaseModel): # TODO: this should not rely specifically on MongoDB.
     """
     Service for storing and retrieving vector embeddings from MongoDB.
     """
     settings: Settings
     mongo_db: MongoDB
     def post_init(self: Self) -> Self:
         logger.debug("Created {}", self.__class__.__name__)
         return self
     # TODO: Weigh the cost of going all async.
+    async def store(self: Self, chunks: Collection[VectorizedChunk]) -> None:
         """
+        Stores vectorized chunks and their embedding vectors in the database.
         Args:
+            chunks: Collection of VectorizedChunk objects to store
+        Returns: None
         """
+        if not chunks:
+            logger.debug("No chunks to store")
+            return
         try:
+            # Get the vector collection - this will create it if it doesn't exist
+            vector_collection = await self.mongo_db.get_collection("vectors")
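+            # Request creation of the vector search index.
+            # NOTE(review): this runs on every store() call — confirm that creating an index
+            # that already exists does not raise.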
+            await self.mongo_db.create_indexes("vectors")
+            # Create documents to store, ensuring compatibility with BSON
+            documents = []
+            for chunk in chunks:
+                # Convert embedding to standard list format (important for BSON compatibility)
+                embedding = list(chunk.embedding) if not isinstance(chunk.embedding, list) else chunk.embedding
+                # Build document with proper structure
+                document = {
+                    "text": chunk.text,
+                    "embedding": embedding,
+                    "metadata": chunk.metadata,
+                    "parent_id": chunk.parent_id,
+                    "chunk_id": chunk.chunk_id
+                }
+                documents.append(document)
+            # Insert into collection as a batch
+            result = await vector_collection.insert_many(documents)
+            logger.info(f"Stored {len(result.inserted_ids)} vector chunks in database")
         except Exception as e:
+            logger.error(f"Error storing vector embeddings: {str(e)}")
             raise
+    async def content_exists(self: Self, key: str)-> bool: # TODO: implement this.
+        """
+        Check if content exists in the database.
+        Not implemented yet: the body is a placeholder, so the call currently returns None.
+        Args:
+            key: The key to check for content existence
+        """
+        pass
+    async def search_by_similarity(self: Self, query: VectorQuery) -> Sequence[Chunk]:
         """
         Query the vector database for similar documents.
         Args:
             query: VectorQuery object with search parameters
         Returns:
+            Sequence[Chunk]: Similar chunks; each chunk's metadata carries the
+            search score under the "similarity_score" key
         """
         try:
+            # Get the vector collection
+            vector_collection = await self.mongo_db.get_collection("vectors")
             # Build aggregation pipeline for vector search
             pipeline = [
                 {
                     "$search": {
+                        "index": "vectors_vector_index",
                         "knnBeta": {
+                            "vector": list(query.query_embeddings),
                             "path": "embedding",
                             "k": query.k
                         }
                 },
                 {
                     "$project": {
                         "text": 1,
                         "metadata": 1,
+                        "parent_id": 1,
+                        "chunk_id": 1,
+                        "score": { "$meta": "searchScore" }
                     }
                 }
             ]
                 metadata_filter = {f"metadata.{k}": v for k, v in query.filter_metadata.items()}
                 pipeline.insert(1, {"$match": metadata_filter})
+            # Add score threshold filter
+            if query.score_threshold > 0:
+                pipeline.append({
+                    "$match": {
+                        "score": { "$gte": query.score_threshold }
+                    }
+                })
             # Execute the pipeline
+            results = await vector_collection.aggregate(pipeline).to_list(length=query.k)
+            # Convert results to Chunk objects
+            chunks = []
             for result in results:
+                chunk = Chunk(
+                    text=result["text"],
+                    parent_id=result["parent_id"],
+                    chunk_id=result["chunk_id"],
+                    metadata={
+                        **result["metadata"],
+                        "similarity_score": result.get("score", 0)
+                    }
                 )
+                chunks.append(chunk)
+            logger.info(f"Found {len(chunks)} similar chunks with similarity search")
+            return chunks
         except Exception as e:
             logger.error(f"Error in similarity search: {str(e)}")