from pydoc import text  # NOTE(review): unused here — looks like an accidental auto-import; kept in case unseen code relies on it
from typing import List, Dict, Tuple, Optional

import logging

from _utils.models.gerar_documento import (
    ContextualizedChunk,
)
from setup.easy_imports import Chroma, BM25Okapi, HuggingFaceEmbeddings
from setup.logging import Axiom


class VectorStore:
    """Builds a Chroma vector store and a BM25 keyword index from document chunks."""

    def __init__(self, embedding_model: str):
        self.logger = logging.getLogger(__name__)
        # Embedding backend handed to Chroma.from_texts below.
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    def create_enhanced_vector_store(
        self,
        chunks: List[ContextualizedChunk],
        is_contextualized_chunk: bool,
        axiom_instance: Axiom,
    ) -> Tuple[Chroma, BM25Okapi, List[str]]:
        """Create vector store and BM25 index with contextualized chunks.

        Args:
            chunks: Chunks to index; each contributes one text and one
                metadata entry, in order.
            is_contextualized_chunk: When True, each indexed text is the
                chunk content prefixed with its process id, context and
                contextual summary; when False, the raw content alone.
            axiom_instance: Telemetry sink; receives the generated texts
                when contextualization is enabled.

        Returns:
            Tuple of (Chroma vector store, BM25Okapi index, chunk ids),
            all aligned to the input chunk order.

        Raises:
            Exception: Wraps any failure during index construction,
                chained to the original error.
        """
        try:
            # Prepare texts with context
            if is_contextualized_chunk:
                texts = [
                    f"""
{chunk.id_do_processo}
{chunk.context}
{chunk.contextual_summary}
{chunk.content}
\n"""
                    for chunk in chunks
                ]
                axiom_instance.send_axiom(f"Chunks gerados: {texts}")
            else:
                texts = [f"{chunk.content}" for chunk in chunks]

            # One metadata record per chunk, aligned with `texts` by index.
            metadatas = []
            for index, chunk in enumerate(chunks):
                context = texts[index]
                metadatas.append(
                    {
                        "chunk_id": chunk.chunk_id,
                        "id_do_processo": str(
                            chunk.id_do_processo
                        ),  # Passing the id as a number breaks: the integer exceeds what Chroma can handle
                        "page": chunk.page_number,
                        "start_char": chunk.start_char,
                        "end_char": chunk.end_char,
                        "context": context,
                    }
                )

            vector_store = Chroma.from_texts(
                texts=texts, metadatas=metadatas, embedding=self.embeddings
            )

            # Create BM25 index over whitespace-tokenized texts.
            tokenized_texts = [text.split() for text in texts]
            bm25 = BM25Okapi(tokenized_texts)

            # Chunk IDs in the same order as texts/metadatas.
            chunk_ids = [chunk.chunk_id for chunk in chunks]

            return vector_store, bm25, chunk_ids

        except Exception as e:
            # Lazy %-formatting so the message is only built if the record is emitted.
            self.logger.error("Error creating enhanced vector store: %s", e)
            # Chain the cause so the original traceback is preserved for callers.
            raise Exception(f"Error creating enhanced vector store: {str(e)}") from e