import os
from typing import List, Tuple

from langchain_core.documents import Document


def combine_documents_without_losing_pagination(
    documents: list[Document],
) -> Tuple[List[Tuple[int, int, int]], str]:
    """Concatenate document texts while recording each document's span.

    Args:
        documents: Documents whose ``page_content`` strings are joined in
            iteration order, with no separator inserted between them.

    Returns:
        A ``(page_boundaries, combined_text)`` pair. ``page_boundaries`` has
        one ``(start_idx, end_idx, page_number)`` tuple per document, where
        ``combined_text[start_idx:end_idx]`` is that document's text
        (``end_idx`` is exclusive). ``page_number`` is ``metadata["page"]``
        when that key is present; otherwise it is the document's 1-based
        position in ``documents``.
    """
    pieces: List[str] = []
    page_boundaries: List[Tuple[int, int, int]] = []
    current_position = 0

    for document in documents:
        text = document.page_content
        start = current_position
        end = start + len(text)
        # No "page" key: fall back to the 1-based ordinal of this document.
        page_number = document.metadata.get("page", len(page_boundaries) + 1)
        page_boundaries.append((start, end, page_number))
        pieces.append(text)
        current_position = end

    # A single join avoids the quadratic cost of repeated ``+=`` on a string.
    return page_boundaries, "".join(pieces)


class SplitterUtils:
    """Helper routines used when splitting documents."""

    def get_file_type(self, file_path) -> str:
        """Classify a file by its extension, ignoring case.

        Args:
            file_path: Path whose extension is inspected; the file itself is
                never opened.

        Returns:
            ``"pdf"`` for ``.pdf``, ``"word"`` for ``.docx``, and
            ``"unknown"`` for anything else (the unrecognised extension is
            also printed to stdout).
        """
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()  # Normalize to lowercase
        if ext == ".pdf":
            return "pdf"
        if ext == ".docx":
            return "word"
        print("\next", ext)
        return "unknown"

    # NOTE(review): the function takes no ``self``; it is exposed here as a
    # static method in addition to the module-level name so that both
    # ``SplitterUtils.combine_documents_without_losing_pagination(docs)`` and
    # the bare function call work. Confirm which form callers use.
    combine_documents_without_losing_pagination = staticmethod(
        combine_documents_without_losing_pagination
    )