Spaces:
Running
Running
File size: 8,181 Bytes
cb23311 78209bc eebeb78 6e09bf4 78209bc 12d3e1a 6e09bf4 12d3e1a ab34606 12d3e1a 78209bc 12d3e1a cb23311 78209bc 32df555 12d3e1a ab34606 78209bc 0f952b3 6e09bf4 78209bc ab34606 78209bc ab34606 78209bc 0f952b3 78209bc ab34606 12d3e1a 78209bc 0f952b3 6e09bf4 f490f11 6e09bf4 78209bc ab34606 78209bc f490f11 ab34606 78209bc 12d3e1a 78209bc 12d3e1a 78209bc f490f11 12d3e1a ab34606 12d3e1a b374298 12d3e1a b374298 12d3e1a d32424b ab34606 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 |
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
from _utils.handle_files import return_document_list_with_llama_parser
from _utils.langchain_utils.splitter_util import (
SplitterUtils,
combine_documents_without_losing_pagination,
)
from setup.easy_imports import (
PyPDFLoader,
RecursiveCharacterTextSplitter,
Document,
Docx2txtLoader,
)
from typing import Any, List, Dict, Tuple, Optional, cast
from _utils.models.gerar_relatorio import (
DocumentChunk,
)
import uuid
# Module-level SplitterUtils instance shared by the code below; it is used to
# look up a file's type from its path (see the `get_file_type` call).
splitter_utils = SplitterUtils()
class Splitter:
    """Load documents or raw text and split them into ``DocumentChunk`` objects.

    Every chunk created by this class is also registered in
    ``self.chunk_metadata`` (keyed by chunk id) so that its page and character
    offsets can be looked up later.
    """

    def __init__(
        self,
        chunk_size,
        chunk_overlap,
    ):
        """Create the splitters used by this instance.

        Args:
            chunk_size: Forwarded as ``chunk_size`` to both
                ``Splitter_Simple`` and ``RecursiveCharacterTextSplitter``.
            chunk_overlap: Forwarded as ``chunk_overlap`` to both splitters.
        """
        self.splitter_simple = Splitter_Simple(chunk_size, chunk_overlap)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        # chunk_id -> {"page", "start_char", "end_char"}, stored for tracing.
        # NOTE: entries accumulate across calls; nothing in this class clears them.
        self.chunk_metadata = {}

    async def _load_pages(
        self, pdf_path: str, should_use_llama_parse: bool, isBubble: bool
    ):
        """Return the list of page documents for ``pdf_path``.

        The source is chosen in this order: Bubble (``isBubble``), llama parse
        (``should_use_llama_parse``), otherwise a local loader picked from the
        file type reported by ``splitter_utils.get_file_type``.
        Presumably each returned item is a ``Document`` for one whole page.
        """
        if isBubble:
            print("\nPEGANDO PDF DO BUBBLE")
            return await get_pdf_from_bubble(pdf_path, should_use_llama_parse)  # type: ignore

        if should_use_llama_parse:
            print("\nENVIANDO PDFS PARA LLAMA PARSE")
            return await return_document_list_with_llama_parser(pdf_path)

        print("\nCOMEÇANDO LEITURA DO PDF")
        file_extension = splitter_utils.get_file_type(pdf_path)
        print("file_extension: ", file_extension)
        if file_extension == "pdf":
            pages = PyPDFLoader(pdf_path).load()
        else:
            # Anything that is not reported as "pdf" is read as a .docx file.
            pages = Docx2txtLoader(pdf_path).load()
        print("TERMINOU LEITURA DO PDF")
        print("pages: ", pages)
        return pages

    @staticmethod
    def _page_for_position(page_boundaries, position: int) -> int:
        """Return the page whose ``[start, end)`` range contains ``position``.

        ``page_boundaries`` is iterated as ``(start, end, page_number)``
        triples. When no range matches, the page of the last triple is
        returned; when there are no triples at all, ``0`` is returned.
        """
        page_number = 0
        for start, end, boundary_page in page_boundaries:
            page_number = boundary_page
            if start <= position < end:
                break
        return page_number

    async def load_and_split_document(
        self, pdf_path: str, should_use_llama_parse: bool, isBubble: bool
    ) -> Tuple[List[DocumentChunk], List[str], str]:
        """Load a document and split it into chunks with metadata.

        Args:
            pdf_path: Location of the document, passed to the selected loader.
            should_use_llama_parse: Use llama parse for loading. It also
                disables the ``+1`` page offset applied to each chunk.
            isBubble: Fetch the document from Bubble. In this mode the text is
                split by ``Splitter_Simple`` from the page contents instead
                of from the combined text.

        Returns:
            A tuple ``(chunks, chunks_of_string_only, full_text_as_string)``:
            the ``DocumentChunk`` objects, the raw chunk strings they were
            built from, and a string that is currently always ``""``.
        """
        # NOTE(review): never populated, so callers always receive "".
        full_text_as_string = ""

        pages = await self._load_pages(pdf_path, should_use_llama_parse, isBubble)
        page_boundaries, combined_text = combine_documents_without_losing_pagination(
            pages
        )

        if isBubble:
            chunks_of_string_only: List[str] = list(
                self.splitter_simple.get_chunks_of_string_only_from_list_of_documents(
                    pages
                )
            )
        else:
            chunks_of_string_only = list(self.text_splitter.split_text(combined_text))

        # Offset added to every page number. Presumably the local loaders
        # report 0-based pages while llama parse pages are already 1-based
        # -- TODO confirm.
        somar_pages = 0 if should_use_llama_parse else 1

        chunks: List[DocumentChunk] = []
        text_char = 0
        print("\nQUEBRANDO PDF EM CHUNKS ORGANIZADOS")
        for chunk in chunks_of_string_only:
            chunk_id = str(uuid.uuid4())
            # Offsets are running counters built from chunk lengths: each chunk
            # starts one past the previous chunk's end. They are not searched
            # for in the combined text.
            # NOTE(review): overlapping chunks presumably make these drift
            # ahead of the true offsets.
            start_char = text_char + 1
            end_char = start_char + len(chunk)
            text_char = end_char

            page_number = self._page_for_position(page_boundaries, start_char)

            # Chunk object carrying its position and id alongside the content.
            doc_chunk = DocumentChunk(
                content=chunk,
                contextual_summary="",
                page_number=page_number + somar_pages,
                chunk_id=chunk_id,
                start_char=start_char,
                end_char=end_char,
            )
            chunks.append(doc_chunk)

            # Store metadata for later retrieval
            self.chunk_metadata[chunk_id] = {
                "page": doc_chunk.page_number,
                "start_char": doc_chunk.start_char,
                "end_char": doc_chunk.end_char,
            }
        print("TERMINOU DE ORGANIZAR PDFS EM CHUNKS")

        return chunks, chunks_of_string_only, full_text_as_string

    def load_and_split_text(self, text: str) -> List[DocumentChunk]:
        """Split raw text into chunks with metadata (created only for ragas).

        The text is treated as a single page. ``start_char``/``end_char`` are
        the position of each chunk inside ``text``; ``start_char`` is ``-1``
        when the chunk cannot be found in ``text``.

        Args:
            text: The full text to split.

        Returns:
            The list of ``DocumentChunk`` objects, in splitter order.
        """
        page = Document(page_content=text, metadata={"page": 1})
        text = page.page_content
        # Break the single-page text into chunks smaller than a page.
        page_chunks = self.text_splitter.split_text(text)
        print("\n\n\npage_chunks: ", page_chunks)

        # NOTE(review): the page metadata is 1 and 1 is added, so every chunk
        # reports page 2 -- confirm whether that is intended.
        page_number = cast(int, page.metadata.get("page")) + 1

        chunks = []
        search_from = 0
        for chunk in page_chunks:
            chunk_id = str(uuid.uuid4())
            # Search forward from the previous match so repeated passages map
            # to their own occurrence instead of always the first one.
            start_char = text.find(chunk, search_from)
            if start_char == -1:
                # Fall back to a search over the whole text.
                start_char = text.find(chunk)
            if start_char != -1:
                search_from = start_char + 1
            end_char = start_char + len(chunk)

            # Chunk object carrying its position and id alongside the content.
            doc_chunk = DocumentChunk(
                content=chunk,
                page_number=page_number,
                chunk_id=chunk_id,
                start_char=start_char,
                end_char=end_char,
            )
            chunks.append(doc_chunk)

            # Store metadata for later retrieval
            self.chunk_metadata[chunk_id] = {
                "page": doc_chunk.page_number,
                "start_char": doc_chunk.start_char,
                "end_char": doc_chunk.end_char,
            }

        return chunks
class Splitter_Simple:
    """Small convenience wrapper around one ``RecursiveCharacterTextSplitter``."""

    def __init__(
        self,
        chunk_size=1000,
        chunk_overlap=400,
    ):
        """Create the underlying splitter with the given size and overlap."""
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    async def load_and_split_document(self, pdf_path: str):
        """Load the PDF at ``pdf_path`` and return it already split by the splitter."""
        print("\nCOMEÇANDO LEITURA DO PDF")
        loader = PyPDFLoader(pdf_path)
        split_pages = loader.load_and_split(self.text_splitter)
        print("\nTERMINADO LEITURA DO PDF")
        return split_pages

    def load_and_split_text(self, text: str) -> List[Document]:
        """Split ``text`` and wrap each resulting piece in a ``Document``."""
        pieces = self.text_splitter.split_text(text)
        return [Document(page_content=piece) for piece in pieces]

    def get_chunks_of_string_only_from_list_of_documents(
        self, lista_de_documentos: List[Document]
    ):
        """Concatenate every document's ``page_content`` and split the result.

        The contents are joined with no separator, in list order, and the
        splitter's list of chunk strings is returned.
        """
        joined_text = "".join(
            documento.page_content for documento in lista_de_documentos
        )
        return self.text_splitter.split_text(joined_text)
|