import logging
from typing import List, Tuple

from _utils.models.gerar_documento import ContextualizedChunk
from setup.easy_imports import Chroma, BM25Okapi, HuggingFaceEmbeddings
from setup.logging import Axiom
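
# Chroma, BM25Okapi and HuggingFaceEmbeddings are re-exported through
# setup.easy_imports; they presumably wrap langchain's Chroma integration,
# rank_bm25's BM25Okapi, and a HuggingFace sentence-embedding class.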


class VectorStore:
    """Builds a Chroma vector store and a BM25 index over document chunks."""

    def __init__(self, embedding_model: str):
        self.logger = logging.getLogger(__name__)
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    def create_enhanced_vector_store(
        self,
        chunks: List[ContextualizedChunk],
        is_contextualized_chunk: bool,
        axiom_instance: Axiom,
    ) -> Tuple[Chroma, BM25Okapi, List[str]]:
        """Create a vector store and BM25 index from contextualized chunks.

        Falls back to indexing the raw chunk contents if the contextualized
        build fails.
        """
        try:
            # Prepare texts with context: wrap each chunk in a lightweight
            # XML-style template carrying its document id, context title and
            # contextual summary alongside the raw content.
            if is_contextualized_chunk:
                texts = [
                    f"""<one_chunk>
<document_id>{chunk.id_do_processo}</document_id>
<document_context_title>{chunk.context}</document_context_title>
<document_contextual_summary>{chunk.contextual_summary}</document_contextual_summary>
<document_content>{chunk.content}</document_content>
</one_chunk>\n"""
                    for chunk in chunks
                ]
                axiom_instance.send_axiom(f"Generated chunks: {texts}")
            else:
                texts = [chunk.content for chunk in chunks]
            # Create vector store
            metadatas = []
            for index, chunk in enumerate(chunks):
                context = texts[index]
                metadatas.append(
                    {
                        "chunk_id": chunk.chunk_id,
                        # Kept as a string: passing the id as an integer breaks,
                        # because the value is larger than Chroma can handle
                        "id_do_processo": str(chunk.id_do_processo),
                        "page": chunk.page_number,
                        "start_char": chunk.start_char,
                        "end_char": chunk.end_char,
                        "context": context,
                    }
                )
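            # The full contextualized text is also kept in metadata as
            # "context", so consumers can recover exactly the string that was
            # embedded for any retrieved entry.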
            vector_store = Chroma.from_texts(
                texts=texts, metadatas=metadatas, embedding=self.embeddings
            )
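            # No explicit ids are passed to Chroma.from_texts, so Chroma
            # generates its own; chunk_id stays recoverable via metadata.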
            # Create BM25 index
            tokenized_texts = [text.split() for text in texts]
            bm25 = BM25Okapi(tokenized_texts)
            # Get chunk IDs in order
            chunk_ids = [chunk.chunk_id for chunk in chunks]
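            # BM25Okapi indexes documents positionally: bm25.get_scores(query)
            # returns one score per text in the same order as chunk_ids, so a
            # hybrid retriever can map sparse scores back to chunk ids by index.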
            return vector_store, bm25, chunk_ids
        except Exception as e:
            self.logger.error(f"Error creating enhanced vector store: {e}")
            if not is_contextualized_chunk:
                # Already retried with plain chunk contents; give up.
                raise Exception(f"Error creating enhanced vector store: {e}") from e
            # Fall back to indexing the raw chunk contents without the
            # contextual template.
            return self.create_enhanced_vector_store(chunks, False, axiom_instance)
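

# Minimal usage sketch (illustrative only; the model name, the query, and the
# surrounding chunk/Axiom objects are assumptions, not part of this module):
#
#   store = VectorStore("sentence-transformers/all-MiniLM-L6-v2")
#   vector_store, bm25, chunk_ids = store.create_enhanced_vector_store(
#       chunks, is_contextualized_chunk=True, axiom_instance=axiom
#   )
#   docs = vector_store.similarity_search("some query", k=4)   # dense retrieval
#   scores = bm25.get_scores("some query".split())             # sparse scores
#   best_chunk_id = chunk_ids[max(range(len(scores)), key=scores.__getitem__)]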