import logging
from typing import List, Tuple

from _utils.models.gerar_documento import ContextualizedChunk
from setup.easy_imports import Chroma, BM25Okapi, HuggingFaceEmbeddings
from setup.logging import Axiom
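
# Chroma, BM25Okapi and HuggingFaceEmbeddings are re-exported through
# setup.easy_imports; they presumably wrap langchain's Chroma integration,
# rank_bm25's BM25Okapi, and a HuggingFace sentence-embedding class.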


class VectorStore:
    """Builds a Chroma vector store and a BM25 index over document chunks."""

    def __init__(self, embedding_model: str):
        self.logger = logging.getLogger(__name__)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    def create_enhanced_vector_store(
        self,
        chunks: List[ContextualizedChunk],
        is_contextualized_chunk: bool,
        axiom_instance: Axiom,
    ) -> Tuple[Chroma, BM25Okapi, List[str]]:
        """Create a vector store and BM25 index from contextualized chunks.

        Falls back to indexing the raw chunk contents if the contextualized
        build fails.
        """
        try:
            # Prepare texts with context: wrap each chunk in a lightweight
            # XML-style template carrying its document id, context title and
            # contextual summary alongside the raw content.
            if is_contextualized_chunk:
                texts = [
                    f"""<one_chunk>
<document_id>{chunk.id_do_processo}</document_id>
<document_context_title>{chunk.context}</document_context_title>
<document_contextual_summary>{chunk.contextual_summary}</document_contextual_summary>
<document_content>{chunk.content}</document_content>
</one_chunk>\n"""
                    for chunk in chunks
                ]
                axiom_instance.send_axiom(f"Generated chunks: {texts}")
            else:
                texts = [chunk.content for chunk in chunks]
            # Create vector store
            metadatas = []
            for index, chunk in enumerate(chunks):
                context = texts[index]
                metadatas.append(
                    {
                        "chunk_id": chunk.chunk_id,
                        # Kept as a string: passing the id as an integer breaks,
                        # because the value is larger than Chroma can handle
                        "id_do_processo": str(chunk.id_do_processo),
                        "page": chunk.page_number,
                        "start_char": chunk.start_char,
                        "end_char": chunk.end_char,
                        "context": context,
                    }
                )
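            # The full contextualized text is also kept in metadata as
            # "context", so consumers can recover exactly the string that was
            # embedded for any retrieved entry.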
            vector_store = Chroma.from_texts(
                texts=texts, metadatas=metadatas, embedding=self.embeddings
            )
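            # No explicit ids are passed to Chroma.from_texts, so Chroma
            # generates its own; chunk_id stays recoverable via metadata.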
            # Create BM25 index
            tokenized_texts = [text.split() for text in texts]
            bm25 = BM25Okapi(tokenized_texts)
            # Get chunk IDs in order
            chunk_ids = [chunk.chunk_id for chunk in chunks]
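            # BM25Okapi indexes documents positionally: bm25.get_scores(query)
            # returns one score per text in the same order as chunk_ids, so a
            # hybrid retriever can map sparse scores back to chunk ids by index.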
            return vector_store, bm25, chunk_ids
        except Exception as e:
            self.logger.error(f"Error creating enhanced vector store: {e}")
            if not is_contextualized_chunk:
                # Already retried with plain chunk contents; give up.
                raise Exception(f"Error creating enhanced vector store: {e}") from e
            # Fall back to indexing the raw chunk contents without the
            # contextual template.
            return self.create_enhanced_vector_store(chunks, False, axiom_instance)
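

# Minimal usage sketch (illustrative only; the model name, the query, and the
# surrounding chunk/Axiom objects are assumptions, not part of this module):
#
#   store = VectorStore("sentence-transformers/all-MiniLM-L6-v2")
#   vector_store, bm25, chunk_ids = store.create_enhanced_vector_store(
#       chunks, is_contextualized_chunk=True, axiom_instance=axiom
#   )
#   docs = vector_store.similarity_search("some query", k=4)   # dense retrieval
#   scores = bm25.get_scores("some query".split())             # sparse scores
#   best_chunk_id = chunk_ids[max(range(len(scores)), key=scores.__getitem__)]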