Spaces:
Running
Running
File size: 2,756 Bytes
55f46c1 12d3e1a 588b95c 12d3e1a 93c6cb3 12d3e1a 93c6cb3 12d3e1a 55f46c1 605a49c 93c6cb3 605a49c f45e723 ecc78bf 55f46c1 93c6cb3 12d3e1a 55f46c1 78209bc 12d3e1a f8e2c8b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
from pydoc import text
from typing import List, Dict, Tuple, Optional
from _utils.models.gerar_documento import (
ContextualizedChunk,
)
from setup.easy_imports import Chroma, BM25Okapi, HuggingFaceEmbeddings
import logging
from setup.logging import Axiom
class VectorStore:
    """Builds the hybrid retrieval backend for a document: a Chroma dense
    vector store plus a BM25 sparse index over the same chunk texts."""

    def __init__(self, embedding_model):
        """
        Args:
            embedding_model: HuggingFace model name used to embed chunk texts.
        """
        self.logger = logging.getLogger(__name__)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    def create_enhanced_vector_store(
        self,
        chunks: List[ContextualizedChunk],
        is_contextualized_chunk: bool,
        axiom_instance: Axiom,
    ) -> Tuple[Chroma, BM25Okapi, List[str]]:
        """Create vector store and BM25 index with contextualized chunks.

        Args:
            chunks: Ordered chunks to index; their order defines the BM25
                document order and the returned id list.
            is_contextualized_chunk: When True, each chunk is wrapped in an
                XML-like envelope carrying its id, context title and
                contextual summary before being embedded/tokenized.
            axiom_instance: Telemetry sink; receives the generated texts when
                contextualized formatting is used.

        Returns:
            Tuple of (Chroma vector store, BM25Okapi index, chunk ids in the
            same order as ``chunks``).

        Raises:
            Exception: Wraps any underlying indexing failure; the original
                cause is chained via ``from`` so the traceback is preserved.
        """
        try:
            # Build the text that is both embedded (Chroma) and tokenized (BM25).
            if is_contextualized_chunk:
                texts = [
                    f"""<one_chunk>
<document_id>{chunk.id_do_processo}</document_id>
<document_context_title>{chunk.context}</document_context_title>
<document_contextual_summary>{chunk.contextual_summary}</document_contextual_summary>
<document_content>{chunk.content}</document_content>
</one_chunk>\n"""
                    for chunk in chunks
                ]
                axiom_instance.send_axiom(f"Chunks gerados: {texts}")
            else:
                texts = [f"{chunk.content}" for chunk in chunks]

            # Per-text metadata stored alongside each embedding in Chroma.
            metadatas = [
                {
                    "chunk_id": chunk.chunk_id,
                    # Stored as a string: passing the id as a number breaks the
                    # code because the integer value exceeds what Chroma can
                    # handle (original comment translated).
                    "id_do_processo": str(chunk.id_do_processo),
                    "page": chunk.page_number,
                    "start_char": chunk.start_char,
                    "end_char": chunk.end_char,
                    "context": context_text,
                }
                for chunk, context_text in zip(chunks, texts)
            ]

            vector_store = Chroma.from_texts(
                texts=texts, metadatas=metadatas, embedding=self.embeddings
            )

            # Sparse index over whitespace-tokenized texts, same order as Chroma.
            tokenized_texts = [t.split() for t in texts]
            bm25 = BM25Okapi(tokenized_texts)

            # Chunk IDs in input order, for mapping BM25 ranks back to chunks.
            chunk_ids = [chunk.chunk_id for chunk in chunks]
            return vector_store, bm25, chunk_ids
        except Exception as e:
            # logger.exception records the traceback; lazy %s defers formatting.
            self.logger.exception("Error creating enhanced vector store: %s", e)
            raise Exception(f"Error creating enhanced vector store: {str(e)}") from e
|