vella-backend / _utils /langchain_utils /Vector_store_class.py
luanpoppe
fix: minor fixx
ecc78bf
from pydoc import text
from typing import List, Dict, Tuple, Optional
from _utils.models.gerar_relatorio import (
ContextualizedChunk,
)
from setup.easy_imports import Chroma, BM25Okapi, HuggingFaceEmbeddings
import logging
from setup.logging import Axiom
class VectorStore:
    """Build a Chroma vector store and a BM25 index from document chunks.

    The embedding model is instantiated once in the constructor and passed to
    every Chroma store created through this instance.
    """

    def __init__(self, embedding_model):
        """Set up the logger and the HuggingFace embeddings.

        Args:
            embedding_model: Value forwarded as ``model_name`` to
                ``HuggingFaceEmbeddings``.
        """
        self.logger = logging.getLogger(__name__)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    def create_enhanced_vector_store(
        self,
        chunks: List[ContextualizedChunk],
        is_contextualized_chunk,
        axiom_instance: Axiom,
    ) -> Tuple[Chroma, BM25Okapi, List[str]]:
        """Create vector store and BM25 index with contextualized chunks.

        Args:
            chunks: Chunks to index. Each one must expose ``chunk_id``,
                ``id_do_processo``, ``page_number``, ``start_char``,
                ``end_char`` and ``content``; the contextualized branch also
                reads ``context`` and ``contextual_summary``.
            is_contextualized_chunk: When truthy, every chunk is rendered as a
                ``<one_chunk>`` block that wraps its id, context title,
                contextual summary and content, and the rendered texts are
                reported through ``axiom_instance``. Otherwise only
                ``chunk.content`` is indexed.
            axiom_instance: Only used in the contextualized branch, through
                ``send_axiom``.

        Returns:
            A ``(vector_store, bm25, chunk_ids)`` tuple; ``chunk_ids`` follows
            the order of ``chunks``.

        Raises:
            Exception: If any step fails. The failure is logged with its
                traceback and the original error is chained as the cause.
        """
        try:
            # Prepare the texts that will be embedded and tokenized.
            if is_contextualized_chunk:
                texts = [
                    f"""<one_chunk>
<document_id>{chunk.id_do_processo}</document_id>
<document_context_title>{chunk.context}</document_context_title>
<document_contextual_summary>{chunk.contextual_summary}</document_contextual_summary>
<document_content>{chunk.content}</document_content>
</one_chunk>\n"""
                    for chunk in chunks
                ]
                axiom_instance.send_axiom(f"Chunks gerados: {texts}")
            else:
                texts = [f"{chunk.content}" for chunk in chunks]

            # One metadata dict per chunk, paired with the text built for it.
            metadatas = [
                {
                    "chunk_id": chunk.chunk_id,
                    # Stored as a string: passing the id as a number breaks
                    # because the integer is larger than Chroma can handle.
                    "id_do_processo": str(chunk.id_do_processo),
                    "page": chunk.page_number,
                    "start_char": chunk.start_char,
                    "end_char": chunk.end_char,
                    "context": chunk_text,
                }
                for chunk, chunk_text in zip(chunks, texts)
            ]

            # Create vector store
            vector_store = Chroma.from_texts(
                texts=texts, metadatas=metadatas, embedding=self.embeddings
            )

            # Create BM25 index over whitespace-split tokens.
            tokenized_texts = [chunk_text.split() for chunk_text in texts]
            bm25 = BM25Okapi(tokenized_texts)

            # Get chunk IDs in order
            chunk_ids = [chunk.chunk_id for chunk in chunks]

            return vector_store, bm25, chunk_ids
        except Exception as e:
            # logger.exception records the traceback alongside the message.
            self.logger.exception(f"Error creating enhanced vector store: {str(e)}")
            # Chain the original error so callers can inspect __cause__.
            raise Exception(f"Error creating enhanced vector store: {str(e)}") from e