Spaces:
Running
Running
File size: 7,436 Bytes
cb23311 78209bc eebeb78 5cb00b6 6e09bf4 5cb00b6 ab79998 6e09bf4 78209bc 588b95c 12d3e1a 5cb00b6 ab34606 12d3e1a 78209bc 12d3e1a cb23311 78209bc 12d3e1a ab34606 78209bc 0f952b3 6e09bf4 78209bc ab34606 78209bc 0f952b3 78209bc ab34606 12d3e1a 78209bc 0f952b3 5cb00b6 6e09bf4 ab79998 5cb00b6 668a7d5 3462a1d 4ef8d92 668a7d5 6e09bf4 f490f11 6e09bf4 78209bc ab34606 78209bc f490f11 ab34606 78209bc 12d3e1a 78209bc 12d3e1a 78209bc f490f11 12d3e1a 93c6cb3 12d3e1a b374298 12d3e1a b374298 12d3e1a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
from _utils.handle_files import return_document_list_with_llama_parser
from _utils.langchain_utils.splitter_util import (
Splitter_Simple,
SplitterUtils,
combine_documents_without_losing_pagination,
)
from setup.easy_imports import (
PyPDFLoader,
RecursiveCharacterTextSplitter,
Document,
Docx2txtLoader,
TextLoader,
PyMuPDFLoader,
)
from typing import Any, List, Dict, Tuple, Optional, cast
from _utils.models.gerar_documento import (
DocumentChunk,
)
import uuid
class Splitter:
    """Load documents and split them into ``DocumentChunk`` objects.

    Every chunk produced by this class is also registered in
    ``self.chunk_metadata`` (keyed by chunk id) so that its page and
    character offsets can be looked up later.
    """

    def __init__(
        self,
        chunk_size: int,
        chunk_overlap: int,
    ):
        """Create the helper splitters.

        Args:
            chunk_size: Forwarded as ``chunk_size`` to ``Splitter_Simple`` and
                ``RecursiveCharacterTextSplitter``.
            chunk_overlap: Forwarded as ``chunk_overlap`` to the same two
                splitters.
        """
        self.splitter_util = SplitterUtils()
        self.splitter_simple = Splitter_Simple(chunk_size, chunk_overlap)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        # chunk_id -> {"page", "start_char", "end_char"}, kept for tracing.
        self.chunk_metadata: Dict[str, Dict[str, Any]] = {}

    def _store_chunk_metadata(self, doc_chunk: DocumentChunk) -> None:
        """Record the page and character span of ``doc_chunk`` by chunk id."""
        self.chunk_metadata[doc_chunk.chunk_id] = {
            "page": doc_chunk.page_number,
            "start_char": doc_chunk.start_char,
            "end_char": doc_chunk.end_char,
        }

    @staticmethod
    def _page_number_for_offset(page_boundaries, offset: int) -> int:
        """Return the page number whose ``[start, end)`` span holds ``offset``.

        ``page_boundaries`` is iterated as ``(start, end, page_number)``
        triples. When no span contains ``offset`` the page number of the last
        boundary is returned (0 when there are no boundaries at all).
        """
        fallback = 0
        for start, end, page_number in page_boundaries:
            if start <= offset < end:
                return page_number
            fallback = page_number
        return fallback

    def _load_local_pages(self, pdf_path: str):
        """Load ``pdf_path`` with the loader that matches its file type.

        The file type comes from ``SplitterUtils.get_file_type``. "pdf", "odt",
        "txt" and "doc" have dedicated loaders; anything else is handed to
        ``Docx2txtLoader``.
        """
        file_extension = self.splitter_util.get_file_type(pdf_path)
        print("file_extension: ", file_extension)

        if file_extension == "pdf":
            try:
                return PyPDFLoader(pdf_path).load()
            except Exception:
                # PyPDF could not read the file: retry with PyMuPDF. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                return PyMuPDFLoader(pdf_path).load()
        if file_extension == "odt":
            full_text = self.splitter_util.load_odt_file(pdf_path)
            return self.splitter_simple.load_and_split_text(full_text)
        if file_extension == "txt":
            return TextLoader(pdf_path).load()
        if file_extension == "doc":
            full_text = self.splitter_util.getTextFromDotDoc(pdf_path)
            return self.splitter_simple.load_and_split_text(full_text)
        return Docx2txtLoader(pdf_path).load()

    async def load_and_split_document(
        self, pdf_path: str, should_use_llama_parse: bool, isBubble: bool
    ) -> Tuple[List[DocumentChunk], List[str]]:
        """Load a document and split it into chunks with metadata.

        Args:
            pdf_path: Location of the document. Passed to
                ``get_pdf_from_bubble`` when ``isBubble`` is true, to
                ``return_document_list_with_llama_parser`` when
                ``should_use_llama_parse`` is true, otherwise to a local
                loader chosen by file type.
            should_use_llama_parse: Selects the LlamaParse loader for
                non-Bubble documents and disables the ``+1`` page offset.
            isBubble: When true the pages come from ``get_pdf_from_bubble``
                and are chunked by ``Splitter_Simple``.

        Returns:
            A tuple ``(chunks, chunks_of_string_only)``: the ``DocumentChunk``
            objects and the raw chunk strings they were built from.
        """
        if isBubble:
            print("\nPEGANDO PDF DO BUBBLE")
            pages = await get_pdf_from_bubble(pdf_path, should_use_llama_parse)  # type: ignore
        elif should_use_llama_parse:
            print("\nENVIANDO PDFS PARA LLAMA PARSE")
            pages = await return_document_list_with_llama_parser(pdf_path)
        else:
            print("\nCOMEÇANDO LEITURA DO PDF")
            pages = self._load_local_pages(pdf_path)
            print("TERMINOU LEITURA DO PDF")
            print("pages: ", pages)

        page_boundaries, combined_text = combine_documents_without_losing_pagination(
            pages
        )

        chunks_of_string_only: List[str]
        if isBubble:
            # Bubble documents are chunked page by page; combined_text is only
            # needed for the boundaries computed above.
            chunks_of_string_only = list(
                self.splitter_simple.get_chunks_of_string_only_from_list_of_documents(
                    pages
                )
            )
        else:
            chunks_of_string_only = list(
                self.text_splitter.split_text(combined_text)
            )

        # Page numbers get +1 unless LlamaParse was requested.
        # NOTE(review): presumably LlamaParse pages are already 1-based - confirm.
        somar_pages = 0 if should_use_llama_parse else 1

        chunks: List[DocumentChunk] = []
        text_char = 0
        print("\nQUEBRANDO PDF EM CHUNKS ORGANIZADOS")
        for chunk in chunks_of_string_only:
            # Offsets are a running total of chunk lengths (each chunk starts
            # one past the previous end); chunk overlap is not discounted.
            start_char = text_char + 1
            end_char = start_char + len(chunk)
            text_char = end_char

            page_number = self._page_number_for_offset(page_boundaries, start_char)

            # Build the chunk object with its position and id.
            doc_chunk = DocumentChunk(
                content=chunk,
                contextual_summary="",
                page_number=page_number + somar_pages,
                chunk_id=str(uuid.uuid4()),
                start_char=start_char,
                end_char=end_char,
            )
            chunks.append(doc_chunk)
            # Store metadata for later retrieval
            self._store_chunk_metadata(doc_chunk)

        print("TERMINOU DE ORGANIZAR PDFS EM CHUNKS")
        return chunks, chunks_of_string_only

    def load_and_split_text(self, text: str) -> List[DocumentChunk]:
        """Split raw text into chunks with metadata (created only for ragas).

        Args:
            text: The full text, treated as a single page.

        Returns:
            One ``DocumentChunk`` per piece returned by the text splitter.
            ``start_char``/``end_char`` are positions of the chunk inside
            ``text``; ``start_char`` is -1 if the chunk cannot be located.
        """
        page = Document(page_content=text, metadata={"page": 1})
        text = page.page_content
        # Break the single-page text into chunks smaller than a page.
        page_chunks = self.text_splitter.split_text(text)
        print("\n\n\npage_chunks: ", page_chunks)

        # NOTE(review): metadata page is 1 and 1 is added again, so every
        # chunk reports page 2 - kept as-is; confirm whether that is intended.
        page_number = cast(int, page.metadata.get("page")) + 1

        chunks: List[DocumentChunk] = []
        search_from = 0
        for chunk in page_chunks:
            # Search forward from just after the previous match so repeated
            # chunks map to successive occurrences instead of the first one.
            start_char = text.find(chunk, search_from)
            if start_char == -1:
                # Not found ahead of the cursor: fall back to a global search.
                start_char = text.find(chunk)
            else:
                search_from = start_char + 1
            end_char = start_char + len(chunk)

            # Build the chunk object with its position and id.
            doc_chunk = DocumentChunk(
                content=chunk,
                page_number=page_number,
                chunk_id=str(uuid.uuid4()),
                start_char=start_char,
                end_char=end_char,
            )
            chunks.append(doc_chunk)
            # Store metadata for later retrieval
            self._store_chunk_metadata(doc_chunk)

        return chunks
|