rag-document-system / src /vector_store.py
hugging2021's picture
Upload folder using huggingface_hub
816825a verified
import logging
import os
from typing import List, Optional, Tuple
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
logger = logging.getLogger(__name__)
class VectorStoreManager:
def __init__(self, persist_directory: str = "./chroma_db", embedding_function: Optional[Embeddings] = None):
self.persist_directory = persist_directory
self.embedding_function = embedding_function
self.vector_store = None
self._ensure_persist_directory()
def _ensure_persist_directory(self):
try:
os.makedirs(self.persist_directory, exist_ok=True)
logger.info(f"Persist directory ensured: {self.persist_directory}")
except Exception as e:
logger.error(f"Error creating persist directory: {e}")
raise e
def initialize_vector_store(self, embedding_function: Optional[Embeddings] = None):
if embedding_function:
self.embedding_function = embedding_function
if not self.embedding_function:
raise ValueError("Embedding function must be provided")
try:
logger.info("Initializing vector store")
self.vector_store = Chroma(
persist_directory=self.persist_directory,
embedding_function=self.embedding_function
)
logger.info("Vector store initialized successfully")
except Exception as e:
logger.error(f"Error initializing vector store: {e}")
raise e
def add_documents(self, documents: List[Document]) -> bool:
try:
if not self.vector_store:
raise ValueError("Vector store not initialized")
logger.info(f"Adding {len(documents)} document(s) to vector store")
self.vector_store.add_documents(documents)
logger.info("Documents added successfully")
return True
except Exception as e:
logger.error(f"Error adding documents to vector store: {e}")
return False
def similarity_search(self, query: str, k: int = 5) -> List[Document]:
try:
if not self.vector_store:
raise ValueError("Vector store not initialized")
logger.info(f"Performing similarity search for query: '{query[:50]}...'")
results = self.vector_store.similarity_search(query, k=k)
logger.info(f"Found {len(results)} similar documents")
return results
except Exception as e:
logger.error(f"Error performing similarity search: {e}")
return []
def similarity_search_with_score(self, query: str, k: int = 5) -> List[Tuple[Document, float]]:
try:
if not self.vector_store:
raise ValueError("Vector store not initialized")
logger.info(f"Performing similarity search with scores for query: '{query[:50]}...'")
results = self.vector_store.similarity_search_with_score(query, k=k)
logger.info(f"Found {len(results)} similar documents with scores")
return results
except Exception as e:
logger.error(f"Error performing similarity search with scores: {e}")
return []
def get_retriever(self, search_kwargs: Optional[dict] = None):
try:
if not self.vector_store:
raise ValueError("Vector store not initialized")
default_kwargs = {"k": 5}
if search_kwargs:
default_kwargs.update(search_kwargs)
retriever = self.vector_store.as_retriever(search_kwargs=default_kwargs)
logger.info("Retriever created successfully")
return retriever
except Exception as e:
logger.error(f"Error creating retriever: {e}")
raise e
def get_collection_stats(self) -> dict:
try:
if not self.vector_store:
return {'total_documents': 0, 'collection_name': None}
collection = self.vector_store._collection
count = collection.count()
return {
'total_documents': count,
'collection_name': collection.name,
'persist_directory': self.persist_directory
}
except Exception as e:
logger.error(f"Error getting collection stats: {e}")
return {'total_documents': 0, 'collection_name': None}
def clear_vector_store(self) -> bool:
try:
if not self.vector_store:
return True
logger.info("Clearing vector store")
self.vector_store._collection.delete(where={})
logger.info("Vector store cleared successfully")
return True
except Exception as e:
logger.error(f"Error clearing vector store: {e}")
return False
def is_initialized(self) -> bool:
return self.vector_store is not None