Spaces:
Running
Running
| import os | |
| from typing import List, Tuple | |
| from langchain_core.documents import Document | |
class SplitterUtils:
    """Utility helpers; currently provides file-type detection by extension."""

    def get_file_type(self, file_path):
        """Classify ``file_path`` by its extension, ignoring letter case.

        Returns ``"pdf"`` for ``.pdf``, ``"word"`` for ``.docx`` and
        ``"unknown"`` for every other extension (including none at all).
        An unrecognised extension is also printed to stdout.
        """
        suffix = os.path.splitext(file_path)[1].lower()
        known_types = {".pdf": "pdf", ".docx": "word"}
        if suffix in known_types:
            return known_types[suffix]
        # Nothing matched: echo the extension that was actually seen.
        print("\next", suffix)
        return "unknown"
def combine_documents_without_losing_pagination(documents: list[Document]):
    """Concatenate page contents while recording each page's character span.

    Returns ``(page_boundaries, combined_text)``. ``page_boundaries`` is a
    list of ``(start_idx, end_idx, page_number)`` tuples whose offsets index
    into ``combined_text`` (``end_idx`` is exclusive, so
    ``combined_text[start_idx:end_idx]`` is that document's content).
    The page number is read from ``metadata["page"]``; when that key is
    absent, the document's 1-based position in ``documents`` is used.
    """
    spans: List[Tuple[int, int, int]] = []  # (start_idx, end_idx, page_number)
    pieces = []
    offset = 0
    for position, doc in enumerate(documents, start=1):
        text = doc.page_content
        next_offset = offset + len(text)
        # `position` equals "number of spans recorded so far + 1".
        spans.append((offset, next_offset, doc.metadata.get("page", position)))
        pieces.append(text)
        offset = next_offset
    return spans, "".join(pieces)