"""Helpers for detecting file types and splitting documents into text chunks."""

import os
from typing import List, Tuple

from langchain_core.documents import Document
from odf.opendocument import load
from odf.text import P

from setup.easy_imports import (
    PyPDFLoader,
    RecursiveCharacterTextSplitter,
)

# Maps a lowercase file extension to the label returned by
# SplitterUtils.get_file_type.
_FILE_TYPES_BY_EXTENSION = {
    ".pdf": "pdf",
    ".docx": "word",
    ".odt": "odt",
    ".txt": "txt",
}


class SplitterUtils:
    """File-level helpers used before a document is split into chunks."""

    def get_file_type(self, file_path):
        """Return a short label for the file type based on its extension.

        Args:
            file_path: Path (or file name) whose extension is inspected.

        Returns:
            One of "pdf", "word", "odt", "txt", or "unknown" when the
            extension is not recognised. The comparison is case-insensitive.
        """
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()  # Normalize to lowercase

        file_type = _FILE_TYPES_BY_EXTENSION.get(ext)
        if file_type is None:
            # Unrecognised extension: report it on stdout, as before.
            print("\next", ext)
            return "unknown"
        return file_type

    def load_odt_file(self, file_path: str):
        """Read an ODT file and return its paragraph text as one string.

        Args:
            file_path: Path of the ODT document to load.

        Returns:
            The collected text nodes joined with newline characters.
        """
        textdoc = load(file_path)

        # Only text nodes that are direct children of a paragraph are kept.
        # NOTE(review): text nested inside child elements of a paragraph is
        # not visited here - confirm whether that is intended.
        text_nodes = [
            node.data
            for paragraph in textdoc.getElementsByType(P)
            for node in paragraph.childNodes
            if node.nodeType == node.TEXT_NODE
        ]
        return "\n".join(text_nodes)


class Splitter_Simple:
    """Thin wrapper around RecursiveCharacterTextSplitter."""

    def __init__(
        self,
        chunk_size=1000,
        chunk_overlap=400,
    ):
        """Create the underlying text splitter.

        Args:
            chunk_size: Forwarded to the splitter as ``chunk_size``.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )

    async def load_and_split_document(self, pdf_path: str):
        """Load a PDF and split it into chunks using the configured splitter.

        The coroutine contains no ``await``; the loader call below is made
        directly once the coroutine is awaited.

        Args:
            pdf_path: Path of the PDF file to load.

        Returns:
            Whatever ``PyPDFLoader(pdf_path).load_and_split`` returns for the
            configured splitter.
        """
        print("\nCOMEƇANDO LEITURA DO PDF")
        pages = PyPDFLoader(pdf_path).load_and_split(self.text_splitter)
        print("\nTERMINADO LEITURA DO PDF")
        return pages

    def load_and_split_text(self, text: str) -> List[Document]:
        """Split raw text and wrap every chunk in a ``Document``.

        Args:
            text: The text to split.

        Returns:
            One ``Document`` per chunk, in splitter order, with only
            ``page_content`` set.
        """
        return [
            Document(page_content=chunk)
            for chunk in self.text_splitter.split_text(text)
        ]

    def get_chunks_of_string_only_from_list_of_documents(
        self, lista_de_documentos: List[Document]
    ):
        """Concatenate the documents' contents and split the result.

        The page contents are joined with no separator between them and the
        combined string is passed to the splitter.

        Args:
            lista_de_documentos: Documents whose ``page_content`` is combined.

        Returns:
            The chunks produced by the splitter's ``split_text``.
        """
        # A single join avoids rebuilding the growing string on every page.
        full_text_as_string = "".join(
            page.page_content for page in lista_de_documentos
        )
        return self.text_splitter.split_text(full_text_as_string)


def combine_documents_without_losing_pagination(
    documents: list[Document],
) -> Tuple[List[Tuple[int, int, int]], str]:
    """Merge documents into one string while recording each page's span.

    Args:
        documents: Documents to concatenate, in order.

    Returns:
        A ``(page_boundaries, combined_text)`` tuple. ``page_boundaries``
        holds one ``(start_idx, end_idx, page_number)`` entry per document,
        where ``combined_text[start_idx:end_idx]`` is that document's
        content. ``page_number`` is the document's ``"page"`` metadata value
        when present, otherwise its 1-based position in ``documents``.
        NOTE(review): confirm the ``"page"`` metadata uses the same base as
        the 1-based fallback.
    """
    page_boundaries: List[Tuple[int, int, int]] = []
    text_parts: List[str] = []
    current_position = 0

    for document in documents:
        page_text = document.page_content
        start = current_position
        end = start + len(page_text)
        page_number = document.metadata.get("page", len(page_boundaries) + 1)
        page_boundaries.append((start, end, page_number))
        text_parts.append(page_text)
        current_position = end

    # Join once at the end instead of growing a string inside the loop.
    return page_boundaries, "".join(text_parts)