Spaces:
Running
Running
File size: 2,756 Bytes
55f46c1 12d3e1a 588b95c 12d3e1a 93c6cb3 12d3e1a 93c6cb3 12d3e1a 55f46c1 605a49c 93c6cb3 605a49c f45e723 ecc78bf 55f46c1 93c6cb3 12d3e1a 55f46c1 78209bc 12d3e1a f8e2c8b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
from pydoc import text
from typing import List, Dict, Tuple, Optional
from _utils.models.gerar_documento import (
ContextualizedChunk,
)
from setup.easy_imports import Chroma, BM25Okapi, HuggingFaceEmbeddings
import logging
from setup.logging import Axiom
class VectorStore:
    """Builds the hybrid retrieval backend for a document: a Chroma dense
    vector store plus a BM25 sparse index over the same chunk texts."""

    def __init__(self, embedding_model):
        """
        Args:
            embedding_model: HuggingFace model name used to embed chunk texts.
        """
        self.logger = logging.getLogger(__name__)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    def create_enhanced_vector_store(
        self,
        chunks: List[ContextualizedChunk],
        is_contextualized_chunk: bool,
        axiom_instance: Axiom,
    ) -> Tuple[Chroma, BM25Okapi, List[str]]:
        """Create vector store and BM25 index with contextualized chunks.

        Args:
            chunks: Ordered chunks to index; their order defines the BM25
                document order and the returned id list.
            is_contextualized_chunk: When True, each chunk is wrapped in an
                XML-like envelope carrying its id, context title and
                contextual summary before being embedded/tokenized.
            axiom_instance: Telemetry sink; receives the generated texts when
                contextualized formatting is used.

        Returns:
            Tuple of (Chroma vector store, BM25Okapi index, chunk ids in the
            same order as ``chunks``).

        Raises:
            Exception: Wraps any underlying indexing failure; the original
                cause is chained via ``from`` so the traceback is preserved.
        """
        try:
            # Build the text that is both embedded (Chroma) and tokenized (BM25).
            if is_contextualized_chunk:
                texts = [
                    f"""<one_chunk>
<document_id>{chunk.id_do_processo}</document_id>
<document_context_title>{chunk.context}</document_context_title>
<document_contextual_summary>{chunk.contextual_summary}</document_contextual_summary>
<document_content>{chunk.content}</document_content>
</one_chunk>\n"""
                    for chunk in chunks
                ]
                axiom_instance.send_axiom(f"Chunks gerados: {texts}")
            else:
                texts = [f"{chunk.content}" for chunk in chunks]

            # Per-text metadata stored alongside each embedding in Chroma.
            metadatas = [
                {
                    "chunk_id": chunk.chunk_id,
                    # Stored as a string: passing the id as a number breaks the
                    # code because the integer value exceeds what Chroma can
                    # handle (original comment translated).
                    "id_do_processo": str(chunk.id_do_processo),
                    "page": chunk.page_number,
                    "start_char": chunk.start_char,
                    "end_char": chunk.end_char,
                    "context": context_text,
                }
                for chunk, context_text in zip(chunks, texts)
            ]

            vector_store = Chroma.from_texts(
                texts=texts, metadatas=metadatas, embedding=self.embeddings
            )

            # Sparse index over whitespace-tokenized texts, same order as Chroma.
            tokenized_texts = [t.split() for t in texts]
            bm25 = BM25Okapi(tokenized_texts)

            # Chunk IDs in input order, for mapping BM25 ranks back to chunks.
            chunk_ids = [chunk.chunk_id for chunk in chunks]
            return vector_store, bm25, chunk_ids
        except Exception as e:
            # logger.exception records the traceback; lazy %s defers formatting.
            self.logger.exception("Error creating enhanced vector store: %s", e)
            raise Exception(f"Error creating enhanced vector store: {str(e)}") from e
|