Merge branch 'refactor-2' into refactor-3
Files changed:
- .github/workflows/main.yml +1 -1
- Dockerfile +2 -2
- pyproject.toml +1 -0
- src/ctp_slack_bot/core/logging.py +10 -10
- src/ctp_slack_bot/db/mongo_db.py +95 -44
- src/ctp_slack_bot/services/content_ingestion_service.py +6 -6
- src/ctp_slack_bot/services/language_model_service.py +18 -18
- src/ctp_slack_bot/services/vector_database_service.py +42 -15
.github/workflows/main.yml
CHANGED
@@ -16,4 +16,4 @@ jobs:
       - name: Push to hub
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: git push https://
+        run: git push https://KingZack:[email protected]/spaces/KingZack/ctp-slack-bot main

Dockerfile
CHANGED
@@ -25,8 +25,8 @@ RUN pip install --no-cache-dir .
 RUN useradd -m appuser
 USER appuser
 
-# Expose a volume mount for logs
-VOLUME
+# Expose a volume mount for logs ― Hugging Face Spaces requires specifically /data.
+VOLUME /data
 
 # Run the application.
 CMD ["python", "-m", "ctp_slack_bot.app"]

pyproject.toml
CHANGED
@@ -34,6 +34,7 @@ dependencies = [
     "webvtt-py>=0.5.1",
     "slack-sdk>=3.35.0",
     "slack_bolt>=1.23.0",
+    "pymongo>=4.11.3 ",
     "motor>=3.7.0",
     "openai>=1.70.0"
     "google-api-python-client>=2.167.0",

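A quick way to confirm the newly pinned pymongo (and the motor driver that builds on it) resolves inside the built image is to check the installed versions at runtime. This is a small standalone sketch, not part of the commit. Separately, the unchanged context above shows `"openai>=1.70.0"` with no trailing comma before the next entry; if the file really reads that way, a TOML parser will reject the dependency array.

```python
# Standalone check that the new dependency is importable and satisfies the pin;
# not part of the commit.
from importlib.metadata import version

import pymongo  # added in this commit via "pymongo>=4.11.3"

print("pymongo", version("pymongo"))
print("motor", version("motor"))
assert tuple(int(p) for p in version("pymongo").split(".")[:2]) >= (4, 11)
```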
src/ctp_slack_bot/core/logging.py
CHANGED
@@ -75,16 +75,16 @@ def setup_logging() -> None:
     )
 
     # Add file handler for non-DEBUG environments.
-    if log_level != "DEBUG":
+    # if log_level != "DEBUG":
+    #     logger.add(
+    #         "/data/app.log",
+    #         rotation="10 MB",
+    #         retention="1 week",
+    #         compression="zip",
+    #         format=format_string,
+    #         level=log_level,
+    #         serialize=(log_format == "json"),
+    #     )
 
     # Intercept standard logging messages.
     basicConfig(handlers=[InterceptHandler()], level=0, force=True)

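If the file sink is re-enabled later, it only makes sense when the /data volume from the Dockerfile change is actually present and writable. Below is a minimal sketch of that guard, reusing the same loguru parameters as the commented-out block; it is a suggestion, not part of the commit, and it assumes the same `format_string`, `log_level`, and `log_format` variables that `setup_logging()` already defines.

```python
# Sketch: re-enable the loguru file sink only when /data is writable.
import os

from loguru import logger


def add_file_sink(format_string: str, log_level: str, log_format: str) -> None:
    log_dir = "/data"
    if log_level != "DEBUG" and os.path.isdir(log_dir) and os.access(log_dir, os.W_OK):
        logger.add(
            os.path.join(log_dir, "app.log"),
            rotation="10 MB",        # roll the file once it reaches 10 MB
            retention="1 week",      # keep rotated files for a week
            compression="zip",       # compress rotated files
            format=format_string,
            level=log_level,
            serialize=(log_format == "json"),
        )
```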
src/ctp_slack_bot/db/mongo_db.py
CHANGED
@@ -1,6 +1,7 @@
 from dependency_injector.resources import Resource
 from motor.motor_asyncio import AsyncIOMotorClient
 from pymongo.errors import ConnectionFailure, ServerSelectionTimeoutError
+from pymongo.operations import SearchIndexModel
 from loguru import logger
 from pydantic import BaseModel, PrivateAttr
 from typing import Any, Dict, Optional, Self
@@ -27,6 +28,10 @@ class MongoDB(BaseModel):
         """Initialize MongoDB client with settings."""
         try:
             connection_string = self.settings.MONGODB_URI.get_secret_value()
+            logger.debug("Connecting to MongoDB using URI: {}", connection_string.replace(
+                connection_string.split('@')[-1].split('/')[0] if '@' in connection_string else '',
+                '[REDACTED]'
+            ))
 
             # Create client with appropriate settings
             self._client = AsyncIOMotorClient(
@@ -54,86 +59,108 @@ class MongoDB(BaseModel):
     @property
     def client(self: Self) -> AsyncIOMotorClient:
         """Get the MongoDB client instance."""
-        if self._client is None:
+        if not hasattr(self, '_client') or self._client is None:
             logger.warning("MongoDB client not initialized. Attempting to initialize.")
             self.connect()
-            if self._client is None:
+            if not hasattr(self, '_client') or self._client is None:
                 raise ConnectionError("Failed to initialize MongoDB client")
         return self._client
 
     @property
     def db(self: Self) -> Any:
         """Get the MongoDB database instance."""
-        if self._db is None:
+        if not hasattr(self, '_db') or self._db is None:
             logger.warning("MongoDB database not initialized. Attempting to initialize client.")
             self.connect()
-            if self._db is None:
+            if not hasattr(self, '_db') or self._db is None:
                 raise ConnectionError("Failed to initialize MongoDB database")
         return self._db
 
     async def ping(self: Self) -> bool:
         """Check if MongoDB connection is alive."""
         try:
+            # Get client to ensure we're connected
+            client = self.client
+
+            # Try a simple ping command
+            await client.admin.command('ping')
+            logger.debug("MongoDB connection is active")
             return True
         except (ConnectionFailure, ServerSelectionTimeoutError) as e:
             logger.error("MongoDB connection failed: {}", e)
             return False
+        except Exception as e:
+            logger.error("Unexpected error during MongoDB ping: {}", e)
+            return False
 
     async def get_collection(self: Self, name: str) -> Any:
         """
         Get a collection by name with validation.
         Creates the collection if it doesn't exist.
         """
+        # First ensure we can connect at all
         if not await self.ping():
+            logger.error("Cannot get collection '{}' - MongoDB connection is not available", name)
             raise ConnectionError("MongoDB connection is not available")
 
-            # Create the collection
-            await self.db.create_collection(name)
+        try:
+            # Get all collection names to check if this one exists
+            logger.debug("Checking if collection '{}' exists", name)
+            collection_names = await self.db.list_collection_names()
+
+            if name not in collection_names:
+                logger.info("Collection '{}' does not exist. Creating it.", name)
+                # Create the collection
+                await self.db.create_collection(name)
+                logger.debug("Successfully created collection '{}'", name)
+            else:
+                logger.debug("Collection '{}' already exists", name)
+
+            # Get and return the collection
+            collection = self.db[name]
+            return collection
+        except Exception as e:
+            logger.error("Error accessing collection '{}': {}", name, e)
+            raise
 
-    async def create_indexes(self: Self, collection_name: str
+    async def create_indexes(self: Self, collection_name: str) -> None:
         """
-        Create
+        Create a vector search index on a collection.
+
+        Args:
+            collection_name: Name of the collection
         """
         collection = await self.get_collection(collection_name)
 
-                "embedding": {
-                    "type": "knnVector",
-                    "dimensions": self.settings.VECTOR_DIMENSION,
-                    "similarity": "cosine"
-                }
-            }
+        try:
+            # Create search index model using MongoDB's recommended approach
+            search_index_model = SearchIndexModel(
+                definition={
+                    "fields": [
+                        {
+                            "type": "vector",
+                            "path": "embedding",
+                            "numDimensions": self.settings.VECTOR_DIMENSION,
+                            "similarity": "cosine",
+                            "quantization": "scalar"
+                        }
+                    ]
+                },
+                name=f"{collection_name}_vector_index",
+                type="vectorSearch"
+            )
+
+            # Create the search index using the motor collection
+            result = await collection.create_search_index(search_index_model)
+            logger.info("Vector search index '{}' created for collection {}", result, collection_name)
+
+        except Exception as e:
+            if "command not found" in str(e).lower():
+                logger.warning("Vector search not supported by this MongoDB instance. Some functionality may be limited.")
+                # Create a fallback standard index on embedding field
+                await collection.create_index("embedding")
+                logger.info("Created standard index on 'embedding' field as fallback")
+            else:
                 logger.error("Failed to create vector index: {}", e)
                 raise
 
@@ -147,6 +174,30 @@ class MongoDB(BaseModel):
 
 class MongoDBResource(Resource):
     def init(self: Self, settings: Settings) -> MongoDB:
+        logger.info("Initializing MongoDB connection for database: {}", settings.MONGODB_NAME)
         mongo_db = MongoDB(settings=settings)
         mongo_db.connect()
+
+        # Test the connection asynchronously - this will run after init returns
+        asyncio.create_task(self._test_connection(mongo_db))
+
         return mongo_db
+
+    async def _test_connection(self, mongo_db: MongoDB) -> None:
+        """Test MongoDB connection and log the result."""
+        try:
+            is_connected = await mongo_db.ping()
+            if is_connected:
+                logger.info("MongoDB connection test successful!")
+            else:
+                logger.error("MongoDB connection test failed!")
+        except Exception as e:
+            logger.error("Error testing MongoDB connection: {}", e)
+
+    async def shutdown(self: Self, mongo_db: MongoDB) -> None:
+        """Close MongoDB connection on shutdown."""
+        try:
+            logger.info("Closing MongoDB connection...")
+            await mongo_db.close()
+        except Exception as e:
+            logger.error("Error closing MongoDB connection: {}", e)

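For orientation, here is a minimal way the reworked MongoDB helper might be exercised end to end (connect, ping, ensure the vectors collection and its vector index). It is a sketch only: the Settings import path is an assumption, the field names are taken from the diff, and it presumes `asyncio` is imported at module level in mongo_db.py, which the hunks shown here do not display.

```python
# Minimal usage sketch for the reworked MongoDB helper; the Settings import path
# and configuration fields are assumptions based on the diff, not verified code.
import asyncio

from ctp_slack_bot.core.config import Settings   # hypothetical import path
from ctp_slack_bot.db.mongo_db import MongoDB


async def main() -> None:
    mongo_db = MongoDB(settings=Settings())      # expects MONGODB_URI, MONGODB_NAME, VECTOR_DIMENSION
    mongo_db.connect()
    if not await mongo_db.ping():
        raise SystemExit("MongoDB is unreachable")
    await mongo_db.create_indexes("vectors")     # creates the collection and its vector search index
    collection = await mongo_db.get_collection("vectors")
    print("vectors collection ready:", collection.name)


if __name__ == "__main__":
    asyncio.run(main())
```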
src/ctp_slack_bot/services/content_ingestion_service.py
CHANGED
@@ -34,16 +34,16 @@ class ContentIngestionService(BaseModel):
             # logger.debug("Ignored content with ID {} because it already exists in the database.", content.id)
             # return
         chunks = content.get_chunks()
-        self.__vectorize_and_store_chunks_in_database(chunks)
+        await self.__vectorize_and_store_chunks_in_database(chunks)
         logger.debug("Stored {} vectorized chunk(s) in the database.", len(chunks))
 
     async def process_incoming_slack_message(self: Self, slack_message: SlackMessage) -> None:
         logger.debug("Content ingestion service received a Slack message: {}", slack_message.text)
         chunks = slack_message.get_chunks()
-        self.__vectorize_and_store_chunks_in_database(chunks)
+        await self.__vectorize_and_store_chunks_in_database(chunks)
         logger.debug("Stored {} vectorized chunk(s) in the database.", len(chunks))
 
-    def __vectorize_and_store_chunks_in_database(self: Self, chunks: Sequence[Chunk]) -> None:
+    async def __vectorize_and_store_chunks_in_database(self: Self, chunks: Sequence[Chunk]) -> None:
+        vectorized_chunks = self.vectorization_service.vectorize(chunks) # TODO
+        await self.vector_database_service.store(vectorized_chunks) # TODO
+

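The substantive fix here is that the private helper became a coroutine and both call sites now await it; without the `await`, calling an async method only creates a coroutine object and its body never runs. A tiny self-contained illustration, independent of the project's classes:

```python
# Why the added `await` matters; unrelated to the project code.
import asyncio


async def store(chunks: list[str]) -> None:
    print(f"storing {len(chunks)} chunk(s)")


async def main() -> None:
    store(["a", "b"])        # creates a coroutine that never runs (RuntimeWarning: never awaited)
    await store(["a", "b"])  # actually executes and prints


asyncio.run(main())
```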
src/ctp_slack_bot/services/language_model_service.py
CHANGED
@@ -34,22 +34,22 @@ class LanguageModelService(BaseModel):
         str: Generated answer
         """
         logger.debug("Generating response for question “{}” using {} context chunks…", question, len(context))
+        messages = [
+            {"role": "system", "content": self.settings.SYSTEM_PROMPT},
+            {"role": "user", "content":
+                f"""Student Question: {question}
+
+Context from class materials and transcripts:
+{'\n'.join(chunk.text for chunk in context)}
+
+Please answer the Student Question based on the Context from class materials and transcripts. If the context doesn’t contain relevant information, acknowledge that and suggest asking the professor."""}
+        ]
+        response: ChatCompletion = self._open_ai_client.chat.completions.create(
+            model=self.settings.CHAT_MODEL,
+            messages=messages,
+            max_tokens=self.settings.MAX_TOKENS,
+            temperature=self.settings.TEMPERATURE
+        )
 
-        return f"Mock response to “{question}”"
+        return response.choices[0].message.content
+        # return f"Mock response to “{question}”"

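For reference, here is a standalone sketch of the same prompt-assembly and chat-completion pattern using the openai 1.x client directly. The model name, system prompt, and tuning values are placeholders, not the project's settings, and the function is not part of the commit.

```python
# Standalone sketch of the prompt-assembly + chat-completion pattern above;
# model name and tuning values are placeholders.
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def answer(question: str, context_texts: list[str]) -> str:
    prompt = (
        f"Student Question: {question}\n\n"
        "Context from class materials and transcripts:\n"
        + "\n".join(context_texts)
        + "\n\nPlease answer the Student Question based on the context above."
    )
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # placeholder model
        messages=[
            {"role": "system", "content": "You are a helpful teaching assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=512,
        temperature=0.2,
    )
    return response.choices[0].message.content
```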
src/ctp_slack_bot/services/vector_database_service.py
CHANGED
@@ -35,9 +35,11 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
 
         try:
             # Get the vector collection - this will create it if it doesn't exist
+            logger.debug("Getting vectors collection for storing {} chunks", len(chunks))
             vector_collection = await self.mongo_db.get_collection("vectors")
 
             # Ensure vector search index exists
+            logger.debug("Creating vector search index for vectors collection")
             await self.mongo_db.create_indexes("vectors")
 
             # Create documents to store, ensuring compatibility with BSON
@@ -57,11 +59,15 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
             documents.append(document)
 
             # Insert into collection as a batch
+            logger.debug("Inserting {} documents into vectors collection", len(documents))
             result = await vector_collection.insert_many(documents)
-            logger.info(
+            logger.info("Stored {} vector chunks in database", len(result.inserted_ids))
 
         except Exception as e:
-            logger.error(
+            logger.error("Error storing vector embeddings: {}", str(e))
+            # Include more diagnostic information
+            logger.debug("MongoDB connection info: URI defined: {}, DB name: {}",
+                bool(self.settings.MONGODB_URI), self.settings.MONGODB_NAME)
             raise
 
     async def content_exists(self: Self, key: str)-> bool: # TODO: implement this.
@@ -85,18 +91,19 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
         """
         try:
             # Get the vector collection
+            logger.debug("Getting vectors collection for similarity search")
             vector_collection = await self.mongo_db.get_collection("vectors")
 
-            # Build aggregation pipeline for vector search
+            # Build aggregation pipeline for vector search using official MongoDB format
+            logger.debug("Building vector search pipeline with query embedding dimension: {}", len(query.query_embeddings))
             pipeline = [
                 {
-                    "$
+                    "$vectorSearch": {
                         "index": "vectors_vector_index",
-                        "
+                        "path": "embedding",
+                        "queryVector": query.query_embeddings, #list(query.query_embeddings),
+                        "numCandidates": query.k * 10,
+                        "limit": query.k
                     }
                 },
                 {
@@ -105,7 +112,7 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
                 "metadata": 1,
                 "parent_id": 1,
                 "chunk_id": 1,
-                "score": { "$meta": "
+                "score": { "$meta": "vectorSearchScore" }
                 }
             }
         ]
@@ -114,17 +121,32 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
         if query.filter_metadata:
             metadata_filter = {f"metadata.{k}": v for k, v in query.filter_metadata.items()}
             pipeline.insert(1, {"$match": metadata_filter})
+            logger.debug("Added metadata filters to search: {}", query.filter_metadata)
 
-        # Add score threshold filter
+        # Add score threshold filter if needed
         if query.score_threshold > 0:
             pipeline.append({
                 "$match": {
                     "score": { "$gte": query.score_threshold }
                 }
             })
+            logger.debug("Added score threshold filter: {}", query.score_threshold)
 
+        try:
+            # Execute the vector search pipeline
+            logger.debug("Executing vector search pipeline")
+            results = await vector_collection.aggregate(pipeline).to_list(length=query.k)
+            logger.debug("Vector search returned {} results", len(results))
+        except Exception as e:
+            logger.warning("Vector search failed: {}. Falling back to basic text search.", str(e))
+            # Fall back to basic filtering with limit
+            query_filter = {}
+            if query.filter_metadata:
+                query_filter.update({f"metadata.{k}": v for k, v in query.filter_metadata.items()})
+
+            logger.debug("Executing fallback basic search with filter: {}", query_filter)
+            results = await vector_collection.find(query_filter).limit(query.k).to_list(length=query.k)
+            logger.debug("Fallback search returned {} results", len(results))
 
         # Convert results to Chunk objects
         chunks = []
@@ -140,9 +162,14 @@ class VectorDatabaseService(BaseModel): # TODO: this should not rely specificall
             )
             chunks.append(chunk)
 
-            logger.info(
+            logger.info("Found {} similar chunks with similarity search", len(chunks))
             return chunks
 
         except Exception as e:
-            logger.error(
+            logger.error("Error in similarity search: {}", str(e))
+            # Include additional diagnostic information
+            logger.debug("MongoDB connection info: URI defined: {}, DB name: {}",
+                bool(self.settings.MONGODB_URI), self.settings.MONGODB_NAME)
+            logger.debug("Query details: k={}, dimension={}",
+                query.k, len(query.query_embeddings) if query.query_embeddings else "None")
             raise

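To see the `$vectorSearch` pipeline shape in isolation, here is a minimal sketch run with motor against MongoDB Atlas. The index name and the `embedding` path mirror the diff; the URI, database name, and the dummy query embedding are placeholders, and this is not part of the commit.

```python
# Minimal sketch of the $vectorSearch aggregation used above (MongoDB Atlas + motor).
import asyncio

from motor.motor_asyncio import AsyncIOMotorClient


async def vector_search(uri: str, db_name: str, query_embedding: list[float], k: int = 5) -> list[dict]:
    collection = AsyncIOMotorClient(uri)[db_name]["vectors"]
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vectors_vector_index",   # same index name the diff creates
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": k * 10,
                "limit": k,
            }
        },
        {"$project": {"text": 1, "metadata": 1, "score": {"$meta": "vectorSearchScore"}}},
    ]
    return await collection.aggregate(pipeline).to_list(length=k)


# Example call (placeholders):
# results = asyncio.run(vector_search("mongodb+srv://…", "ctp_slack_bot", [0.0] * 1536))
```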