# NOTE(review): stray non-Python lines ("Spaces:" / "Running" / "Running") looked
# like terminal/log output pasted into the module — preserved as a comment;
# confirm they carry no meaning and delete.
from pydoc import text | |
from typing import List, Dict, Tuple, Optional | |
from _utils.models.gerar_documento import ( | |
ContextualizedChunk, | |
) | |
from setup.easy_imports import Chroma, BM25Okapi, HuggingFaceEmbeddings | |
import logging | |
from setup.logging import Axiom | |
class VectorStore:
    """Builds a hybrid retrieval backend — a dense Chroma vector store plus a
    sparse BM25 index — from pre-processed document chunks."""

    def __init__(self, embedding_model):
        """
        Args:
            embedding_model: HuggingFace model name used to compute dense
                embeddings for the vector store.
        """
        self.logger = logging.getLogger(__name__)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    def create_enhanced_vector_store(
        self,
        chunks: List[ContextualizedChunk],
        is_contextualized_chunk: bool,
        axiom_instance: Axiom,
    ) -> Tuple[Chroma, BM25Okapi, List[str]]:
        """Create vector store and BM25 index with contextualized chunks.

        Args:
            chunks: Chunks to index; input order is preserved in the returned
                chunk-id list.
            is_contextualized_chunk: When truthy, each chunk is wrapped in an
                XML-like envelope carrying its context title and contextual
                summary, so that signal is embedded alongside the raw content.
            axiom_instance: Telemetry sink that receives the generated texts.

        Returns:
            Tuple of (Chroma vector store, BM25 index, chunk ids in input order).

        Raises:
            Exception: Wraps any failure during index construction, with the
                original error chained as ``__cause__``.
        """
        try:
            # Build the text that will actually be embedded / tokenized.
            if is_contextualized_chunk:
                texts = [
                    f"""<one_chunk>
<document_id>{chunk.id_do_processo}</document_id>
<document_context_title>{chunk.context}</document_context_title>
<document_contextual_summary>{chunk.contextual_summary}</document_contextual_summary>
<document_content>{chunk.content}</document_content>
</one_chunk>\n"""
                    for chunk in chunks
                ]
                axiom_instance.send_axiom(f"Chunks gerados: {texts}")
            else:
                texts = [str(chunk.content) for chunk in chunks]

            # One metadata dict per text, aligned by position with `texts`.
            metadatas = [
                {
                    "chunk_id": chunk.chunk_id,
                    # Chroma cannot handle integers larger than its supported
                    # range, so the process id must be stored as a string.
                    "id_do_processo": str(chunk.id_do_processo),
                    "page": chunk.page_number,
                    "start_char": chunk.start_char,
                    "end_char": chunk.end_char,
                    "context": context,
                }
                for chunk, context in zip(chunks, texts)
            ]

            vector_store = Chroma.from_texts(
                texts=texts, metadatas=metadatas, embedding=self.embeddings
            )

            # Sparse index over whitespace tokens of the same texts.
            # (`txt`, not `text`, to avoid shadowing the module-level import.)
            tokenized_texts = [txt.split() for txt in texts]
            bm25 = BM25Okapi(tokenized_texts)

            # Chunk IDs in the same order as the indexed texts.
            chunk_ids = [chunk.chunk_id for chunk in chunks]

            return vector_store, bm25, chunk_ids
        except Exception as e:
            # Log with traceback, then re-raise with the cause chained so the
            # original failure is not lost.
            self.logger.exception("Error creating enhanced vector store: %s", e)
            raise Exception(f"Error creating enhanced vector store: {str(e)}") from e