vella-backend / _utils /splitters /splitter_util.py
luanpoppe
feat: adicionando suporte a arquivos do word
6e09bf4
raw
history blame
1.04 kB
import logging
import os
from typing import List, Tuple

from langchain_core.documents import Document
class SplitterUtils:
    """Utility helpers for identifying the kind of file being processed."""

    def get_file_type(self, file_path: str) -> str:
        """Classify a file by its extension.

        Only the extension of ``file_path`` is inspected; the file is never
        opened, so it does not need to exist. The comparison is
        case-insensitive.

        Args:
            file_path: Path or file name whose extension should be examined.

        Returns:
            ``"pdf"`` for ``.pdf`` files, ``"word"`` for ``.docx`` files and
            ``"unknown"`` for anything else (including names with no
            extension).
        """
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()  # Normalize so ".PDF" and ".pdf" are treated alike.

        if ext == ".pdf":
            return "pdf"
        if ext == ".docx":
            return "word"

        # Report the unsupported extension through logging rather than a raw
        # print so callers control where (and whether) the message is shown.
        logging.getLogger(__name__).warning(
            "Unsupported file extension: %r", ext
        )
        return "unknown"
def combine_documents_without_losing_pagination(
    documents: "list[Document]",
) -> Tuple[List[Tuple[int, int, int]], str]:
    """Concatenate documents into one string while recording page offsets.

    The ``page_content`` of every document is joined, in order and with no
    separator, into a single string. For each document a
    ``(start_idx, end_idx, page_number)`` tuple is recorded such that
    ``combined_text[start_idx:end_idx]`` equals that document's
    ``page_content`` (``end_idx`` is exclusive).

    Args:
        documents: Documents to combine. Each must expose ``page_content``
            (text) and ``metadata`` (a mapping).

    Returns:
        A ``(page_boundaries, combined_text)`` tuple. ``page_number`` is the
        document's ``metadata["page"]`` value, used as-is; when that key is
        absent it falls back to the document's 1-based position in
        ``documents``.
    """
    page_boundaries: List[Tuple[int, int, int]] = []
    text_parts: List[str] = []
    current_position = 0

    for document in documents:
        content = document.page_content
        start = current_position
        end = start + len(content)
        # One boundary is appended per document, so the current length + 1 is
        # this document's 1-based ordinal; use it when no page is recorded.
        page_number = document.metadata.get("page", len(page_boundaries) + 1)
        page_boundaries.append((start, end, page_number))
        text_parts.append(content)
        current_position = end

    # Join once at the end instead of growing a string with += per document,
    # which can degrade to quadratic time on large inputs.
    return page_boundaries, "".join(text_parts)