File size: 1,041 Bytes
6e09bf4
c5586ab
 
 
 
6e09bf4
 
 
 
 
 
 
 
 
 
 
 
 
c5586ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import os
from typing import List, Tuple
from langchain_core.documents import Document


class SplitterUtils:
    """Small helpers used when preparing files for splitting."""

    def get_file_type(self, file_path):
        """Classify *file_path* by its extension.

        The comparison is case-insensitive. Returns ``"pdf"`` for ``.pdf``,
        ``"word"`` for ``.docx`` and ``"unknown"`` for anything else; in the
        unknown case the unrecognised extension is also printed to stdout.
        """
        extension = os.path.splitext(file_path)[1].lower()
        known_types = {".pdf": "pdf", ".docx": "word"}
        if extension in known_types:
            return known_types[extension]
        # Surface the unrecognised extension before falling back.
        print("\next", extension)
        return "unknown"


def combine_documents_without_losing_pagination(
    documents: list[Document],
) -> Tuple[List[Tuple[int, int, int]], str]:
    """Concatenate document texts while recording where each one lands.

    Args:
        documents: Documents whose ``page_content`` is joined in order, with
            no separator inserted between them.

    Returns:
        A ``(page_boundaries, combined_text)`` pair. ``page_boundaries`` holds
        one ``(start_idx, end_idx, page_number)`` tuple per input document,
        where ``combined_text[start_idx:end_idx]`` is that document's text
        (``end_idx`` is exclusive). ``page_number`` is taken from
        ``metadata["page"]`` when present; otherwise it falls back to the
        document's 1-based position in ``documents``.
    """
    # Collect the pieces and join once at the end instead of growing a string
    # with ``+=`` on every iteration.
    text_parts: List[str] = []
    page_boundaries: List[Tuple[int, int, int]] = []
    current_position = 0
    for document in documents:
        page_text = document.page_content
        start = current_position
        end = start + len(page_text)
        # One boundary is appended per document, so the list length + 1 is
        # this document's 1-based position.
        page_number = document.metadata.get("page", len(page_boundaries) + 1)
        page_boundaries.append((start, end, page_number))
        text_parts.append(page_text)
        current_position = end
    return page_boundaries, "".join(text_parts)