Spaces:
Running
Running
import os | |
from typing import List, Tuple | |
from langchain_core.documents import Document | |
class SplitterUtils:
    """Helpers for deciding how an input file should be handled."""

    def get_file_type(self, file_path):
        """Classify a file by its extension.

        The extension is compared case-insensitively.

        Args:
            file_path: Path (or bare file name) whose extension is inspected.

        Returns:
            "pdf" for a ".pdf" extension, "word" for ".docx", and
            "unknown" for anything else (including no extension).
            In the "unknown" case the extension is also printed to stdout.
        """
        extension = os.path.splitext(file_path)[1].lower()
        known_types = {".pdf": "pdf", ".docx": "word"}

        file_type = known_types.get(extension)
        if file_type is not None:
            return file_type

        # Unrecognised extension: report it on stdout before giving up.
        print("\next", extension)
        return "unknown"
def combine_documents_without_losing_pagination(
    documents: "list[Document]",
) -> tuple[list[tuple[int, int, int]], str]:
    """Concatenate document texts while recording where each one lands.

    Each document's ``page_content`` is appended, in order and with no
    separator, to a single string. For every document a
    ``(start_idx, end_idx, page_number)`` tuple is recorded, where
    ``start_idx`` is inclusive and ``end_idx`` is exclusive in the
    combined string.

    Args:
        documents: Documents to merge, in the order they should appear.

    Returns:
        A ``(page_boundaries, combined_text)`` pair. ``page_number`` is
        ``metadata["page"]`` when that key is present; otherwise it is the
        document's 1-based position in ``documents``. An empty input
        yields ``([], "")``.
    """
    page_boundaries: list[tuple[int, int, int]] = []
    # Collect the pieces and join once at the end: repeated ``+=`` on a
    # growing string can degrade to quadratic time.
    text_parts: list[str] = []
    current_position = 0

    for document in documents:
        page_text = document.page_content
        start = current_position
        end = start + len(page_text)
        # Fall back to the running 1-based count when no page is recorded.
        page_number = document.metadata.get("page", len(page_boundaries) + 1)
        page_boundaries.append((start, end, page_number))
        text_parts.append(page_text)
        current_position = end

    return page_boundaries, "".join(text_parts)