import logging
from typing import List, Tuple

from _utils.models.gerar_relatorio import (
    ContextualizedChunk,
)
from setup.easy_imports import Chroma, BM25Okapi, HuggingFaceEmbeddings


class VectorStore:
    def __init__(self, embedding_model):
        self.logger = logging.getLogger(__name__)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    def create_enhanced_vector_store(
        self, chunks: List[ContextualizedChunk], is_contextualized_chunk: bool
    ) -> Tuple[Chroma, BM25Okapi, List[str]]:
        """Create a vector store and a BM25 index from contextualized chunks."""
        try:
            # Prepare texts with context: wrap each chunk in XML-style tags so
            # the embedded text carries the document id, context and summary.
            if is_contextualized_chunk:
                texts = [
                    f"""<one_chunk>
<document_id>{chunk.id_do_processo}</document_id>
<document_context_title>Document_context: {chunk.context}</document_context_title>
<document_contextual_summary>{chunk.contextual_summary}</document_contextual_summary>
<document_content>Document_content: {chunk.content}</document_content>
</one_chunk>
"""
                    for chunk in chunks
                ]
            else:
                texts = [chunk.content for chunk in chunks]

            # Build per-chunk metadata stored alongside each embedded text.
            metadatas = []
            for index, chunk in enumerate(chunks):
                context = texts[index]
                metadatas.append(
                    {
                        "chunk_id": chunk.chunk_id,
                        # Pass the id as a string: an integer id can exceed the
                        # largest integer value Chroma can handle and crash.
                        "id_do_processo": str(chunk.id_do_processo),
                        "page": chunk.page_number,
                        "start_char": chunk.start_char,
                        "end_char": chunk.end_char,
                        "context": context,
                    }
                )

            # Create the vector store from the prepared texts and metadata.
            vector_store = Chroma.from_texts(
                texts=texts, metadatas=metadatas, embedding=self.embeddings
            )

            # Create the BM25 index over whitespace-tokenized texts.
            tokenized_texts = [text.split() for text in texts]
            bm25 = BM25Okapi(tokenized_texts)

            # Collect chunk IDs in the same order as the BM25 corpus.
            chunk_ids = [chunk.chunk_id for chunk in chunks]
            return vector_store, bm25, chunk_ids
        except Exception as e:
            self.logger.error(f"Error creating enhanced vector store: {e}")
            # Chain with `from e` so the original traceback is preserved.
            raise Exception(f"Error creating enhanced vector store: {e}") from e
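

# --- Usage sketch (illustrative, not part of the module) ---------------------
# A minimal sketch of how the returned (vector_store, bm25, chunk_ids) triple
# might be combined for hybrid retrieval downstream. The embedding model name,
# the `chunks` variable, and the sample query below are assumptions for
# illustration only; they do not come from this module.
#
# store = VectorStore("sentence-transformers/all-MiniLM-L6-v2")
# vector_store, bm25, chunk_ids = store.create_enhanced_vector_store(
#     chunks, is_contextualized_chunk=True
# )
#
# query = "prazo de recurso"
# # Dense side: semantic similarity search through Chroma.
# dense_hits = vector_store.similarity_search(query, k=5)
# # Sparse side: BM25 scores over the same corpus, aligned with chunk_ids.
# sparse_scores = bm25.get_scores(query.split())
# top_sparse = sorted(
#     zip(chunk_ids, sparse_scores), key=lambda pair: pair[1], reverse=True
# )[:5]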