File size: 2,756 Bytes
55f46c1
12d3e1a
588b95c
12d3e1a
 
 
 
 
93c6cb3
 
12d3e1a
 
 
 
 
 
 
 
93c6cb3
 
 
 
12d3e1a
 
 
 
 
55f46c1
605a49c
 
93c6cb3
605a49c
f45e723
ecc78bf
55f46c1
 
93c6cb3
12d3e1a
 
 
 
 
55f46c1
78209bc
 
 
 
 
 
 
 
 
 
 
 
 
12d3e1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8e2c8b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from pydoc import text
from typing import List, Dict, Tuple, Optional
from _utils.models.gerar_documento import (
    ContextualizedChunk,
)
from setup.easy_imports import Chroma, BM25Okapi, HuggingFaceEmbeddings
import logging

from setup.logging import Axiom


class VectorStore:
    """Builds hybrid retrieval indexes (dense Chroma + sparse BM25) over document chunks."""

    def __init__(self, embedding_model: str):
        # Module-qualified logger so log records can be filtered per module.
        self.logger = logging.getLogger(__name__)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    def create_enhanced_vector_store(
        self,
        chunks: List[ContextualizedChunk],
        is_contextualized_chunk: bool,
        axiom_instance: Axiom,
    ) -> Tuple[Chroma, BM25Okapi, List[str]]:
        """Create vector store and BM25 index with contextualized chunks.

        Args:
            chunks: Chunks to index; their input order defines the alignment
                between the BM25 corpus and the returned chunk-ID list.
            is_contextualized_chunk: When True, each chunk is wrapped in an
                XML-like envelope carrying its context/summary; otherwise the
                raw chunk content is indexed.
            axiom_instance: Telemetry sink; receives the generated texts when
                contextualization is enabled.

        Returns:
            Tuple of (Chroma vector store, BM25Okapi index, chunk IDs in
            input order).

        Raises:
            Exception: Wraps any failure during index construction, chained
                to the original error.
        """
        try:
            # Prepare texts with context
            if is_contextualized_chunk:
                texts = [
                    f"""<one_chunk>
    <document_id>{chunk.id_do_processo}</document_id>
    <document_context_title>{chunk.context}</document_context_title>
    <document_contextual_summary>{chunk.contextual_summary}</document_contextual_summary>
    <document_content>{chunk.content}</document_content>
</one_chunk>\n"""
                    for chunk in chunks
                ]
                axiom_instance.send_axiom(f"Chunks gerados: {texts}")
            else:
                # str() coercion (equivalent to the prior f-string) in case
                # content is not already a plain str — TODO confirm upstream type.
                texts = [str(chunk.content) for chunk in chunks]

            # One metadata dict per chunk, positionally aligned with `texts`.
            metadatas = [
                {
                    "chunk_id": chunk.chunk_id,
                    # Se passar o id como um número o código quebra pelo valor
                    # inteiro ser maior do que o Chroma consegue lidar — so the
                    # process ID is stored as a string.
                    "id_do_processo": str(chunk.id_do_processo),
                    "page": chunk.page_number,
                    "start_char": chunk.start_char,
                    "end_char": chunk.end_char,
                    "context": context_text,
                }
                for chunk, context_text in zip(chunks, texts)
            ]

            vector_store = Chroma.from_texts(
                texts=texts, metadatas=metadatas, embedding=self.embeddings
            )

            # Create BM25 index over whitespace-tokenized texts.
            tokenized_texts = [text.split() for text in texts]
            bm25 = BM25Okapi(tokenized_texts)

            # Chunk IDs in the same order as the BM25 corpus rows.
            chunk_ids = [chunk.chunk_id for chunk in chunks]

            return vector_store, bm25, chunk_ids

        except Exception as e:
            # logger.exception records the traceback; `from e` preserves the
            # causal chain for callers instead of discarding the original error.
            self.logger.exception("Error creating enhanced vector store: %s", e)
            raise Exception(f"Error creating enhanced vector store: {str(e)}") from e