import os
import time
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
from _utils.handle_files import return_document_list_with_llama_parser
from _utils.langchain_utils.splitter_util import (
    Splitter_Simple,
    SplitterUtils,
    combine_documents_without_losing_pagination,
)
from setup.easy_imports import (
    PyPDFLoader,
    RecursiveCharacterTextSplitter,
    Document,
    Docx2txtLoader,
    TextLoader,
    PyMuPDFLoader,
)
from typing import Any, List, Dict, Tuple, Optional, cast
from _utils.models.gerar_documento import (
    DocumentChunk,
)
import uuid
import json
from _utils.google_integration.google_cloud import (
    DOCUMENT_API_ID,
    GCP_PROJECT,
    GCP_REGION,
    GCS_BUCKET_NAME,
    upload_to_gcs,
)
from google.cloud import documentai
from google.cloud import storage


class Splitter:
    def __init__(
        self,
        chunk_size,
        chunk_overlap,
    ):
        self.splitter_util = SplitterUtils()
        self.splitter_simple = Splitter_Simple(chunk_size, chunk_overlap)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        self.chunk_metadata = {}  # Store chunk metadata for tracing

    async def load_and_split_document(
        self,
        pdf_path: str,
        should_use_llama_parse: bool,
        isBubble: bool,
    ):
        """Load PDF and split into chunks with metadata"""
        # loader = PyPDFLoader(pdf_path)
        # if not pages:
        #     pages = get_pdf_from_bubble(
        #         pdf_path
        #     )  # Produces a list of Document objects, each item corresponding to ONE full page of the PDF.
        chunks_of_string_only: List[str] = []

        if isBubble:
            print("\nFETCHING PDF FROM BUBBLE")
            pages = await get_pdf_from_bubble(pdf_path, should_use_llama_parse)  # type: ignore
            page_boundaries, combined_text = (
                combine_documents_without_losing_pagination(pages)
            )
            chunks_of_string_only = (
                chunks_of_string_only
                + self.splitter_simple.get_chunks_of_string_only_from_list_of_documents(
                    pages
                )
            )
        else:
            if should_use_llama_parse:
                print("\nSENDING PDFS TO LLAMA PARSE")
                pages = await return_document_list_with_llama_parser(pdf_path)
                page_boundaries, combined_text = (
                    combine_documents_without_losing_pagination(pages)
                )
                chunks_of_string_only = (
                    chunks_of_string_only + self.text_splitter.split_text(combined_text)
                )
            else:
                print("\nSTARTING TO READ THE PDF")
                file_extension = self.splitter_util.get_file_type(pdf_path)
                print("file_extension: ", file_extension)
                if file_extension == "pdf":
                    try:
                        pages = PyPDFLoader(pdf_path).load()
                    except Exception:
                        # Fall back to PyMuPDF when PyPDFLoader fails to parse the file
                        pages = PyMuPDFLoader(pdf_path).load()
                elif file_extension == "odt":
                    full_text = self.splitter_util.load_odt_file(pdf_path)
                    pages = self.splitter_simple.load_and_split_text(full_text)
                elif file_extension == "txt":
                    pages = TextLoader(pdf_path).load()
                elif file_extension == "doc":
                    # full_text_binary = textract.process(pdf_path)
                    full_text = self.splitter_util.getTextFromDotDoc(pdf_path)
                    pages = self.splitter_simple.load_and_split_text(full_text)
                else:
                    pages = Docx2txtLoader(pdf_path).load()
                print("FINISHED READING THE PDF")
print("pages: ", pages)
page_boundaries, combined_text = (
combine_documents_without_losing_pagination(pages)
)
chunks_of_string_only = (
chunks_of_string_only + self.text_splitter.split_text(combined_text)
)
        chunks: List[DocumentChunk] = []
        char_count = 0

        # for page in pages:
        #     text = page.page_content
        #     page_chunks = self.text_splitter.split_text(
        #         text
        #     )  # Splits the Document holding ONE full page into a list where each item is a chunk, i.e. pieces smaller than one page.

        text_char = 0
        print("\nSPLITTING PDF INTO ORGANIZED CHUNKS")
        for chunk in chunks_of_string_only:
            chunk_id = str(uuid.uuid4())
            start_char = text_char + 1
            end_char = start_char + len(chunk)
            text_char = end_char

            if should_use_llama_parse:
                somar_pages = 0
            else:
                somar_pages = 1

            page_number = 0
            for start, end, page in page_boundaries:
                if start <= start_char < end:
                    page_number = page
                    break

            doc_chunk = DocumentChunk(  # Builds the chunk object with additional information, such as the chunk's position and id
                content=chunk,
                contextual_summary="",
                page_number=page_number + somar_pages,  # 1-based page numbering
                chunk_id=chunk_id,
                start_char=char_count + start_char,
                end_char=char_count + end_char,
            )
            chunks.append(doc_chunk)
            # Store metadata for later retrieval
            self.chunk_metadata[chunk_id] = {
                "page": doc_chunk.page_number,
                "start_char": doc_chunk.start_char,
                "end_char": doc_chunk.end_char,
            }
            # char_count += len(text)
        print("FINISHED ORGANIZING PDFS INTO CHUNKS")
        if len(pages) == 0 or len(chunks) == 0:
            text = await self.getOCRFromGoogleDocumentAPI(pdf_path)
            chunks = self.load_and_split_text(text)  # type: ignore
            chunks_of_string_only = [chunk.content for chunk in chunks]

        return chunks, chunks_of_string_only

    def load_and_split_text(self, text: str) -> List[DocumentChunk]:
        """Load text and split into chunks with metadata - created this function only for ragas"""
        page = Document(page_content=text, metadata={"page": 1})
        chunks = []
        char_count = 0

        text = page.page_content
        page_chunks = self.text_splitter.split_text(
            text
        )  # Splits the Document holding ONE full page into a list where each item is a chunk, i.e. pieces smaller than one page.
        print("\n\n\npage_chunks: ", page_chunks)
        for chunk in page_chunks:
            chunk_id = str(uuid.uuid4())
            start_char = text.find(
                chunk
            )  # Returns the position of the chunk within the full page text
            end_char = start_char + len(chunk)

            doc_chunk = DocumentChunk(  # Builds the chunk object with additional information, such as the chunk's position and id
                content=chunk,
                page_number=cast(int, page.metadata.get("page"))
                + 1,  # 1-based page numbering
                chunk_id=chunk_id,
                start_char=char_count + start_char,
                end_char=char_count + end_char,
            )
            chunks.append(doc_chunk)

            # Store metadata for later retrieval
            self.chunk_metadata[chunk_id] = {
                "page": doc_chunk.page_number,
                "start_char": doc_chunk.start_char,
                "end_char": doc_chunk.end_char,
            }

        char_count += len(text)
        return chunks

    async def getOCRFromGoogleDocumentAPI(self, pdf_path: str):
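        """Run the PDF through a Google Document AI OCR processor and return the extracted text.

        Uploads the file to GCS, starts a batch processing operation, polls until it
        finishes, then downloads the JSON output and returns its top-level 'text' field.
        """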
        pdf_gcs_uri = upload_to_gcs(pdf_path)

        GCS_OUTPUT_PREFIX = "documentai_output/"
        # GCS_INPUT_URI = f"gs://{GCS_BUCKET_NAME}/{f'gemini_uploads/{pdf_gcs_uri}'}"
        GCS_INPUT_URI = pdf_gcs_uri
        GCS_OUTPUT_URI = f"gs://{GCS_BUCKET_NAME}/{GCS_OUTPUT_PREFIX}"

        docai_client = documentai.DocumentProcessorServiceClient()
        processor_name = docai_client.processor_path(
            project=GCP_PROJECT, location="us", processor=DOCUMENT_API_ID
        )

        gcs_document = documentai.GcsDocument(
            gcs_uri=GCS_INPUT_URI,
            mime_type="application/pdf",  # Mime type is specified here for GcsDocument
        )
        gcs_documents = documentai.GcsDocuments(documents=[gcs_document])

        # 3. Create the BatchDocumentsInputConfig
        input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)
        # Note: If GCS_INPUT_URI was a prefix for multiple files, you'd use GcsPrefix:
        # gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=GCS_INPUT_URI_PREFIX)
        # input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix, mime_type="application/pdf")

        # 4. Create the DocumentOutputConfig
        # GCS_OUTPUT_URI should be a gs:// URI prefix where the output JSONs will be stored
        output_config = documentai.DocumentOutputConfig(
            gcs_output_config=documentai.DocumentOutputConfig.GcsOutputConfig(
                gcs_uri=GCS_OUTPUT_URI
            )
        )

        # 5. Construct the BatchProcessRequest
        request = documentai.BatchProcessRequest(
            name=processor_name,
            input_documents=input_config,  # Use 'input_documents'
            document_output_config=output_config,  # Use 'document_output_config'
        )

        # Submit the batch process request (this is a long-running operation)
        operation = docai_client.batch_process_documents(request)

        print("Batch processing operation started. Waiting for completion...")
        while not operation.done():
            time.sleep(15)  # Wait 15 seconds before checking again
            print("Waiting...")
        print("Batch processing operation finished.")

        # --- Download the results from GCS ---
        storage_client = storage.Client(
            project=GCP_PROJECT
        )  # Uses GOOGLE_APPLICATION_CREDENTIALS/ADC
        bucket = storage_client.bucket(GCS_BUCKET_NAME)

        output_blobs = storage_client.list_blobs(
            GCS_BUCKET_NAME, prefix=GCS_OUTPUT_PREFIX
        )

        downloaded_files_texts = []
        try:
            for blob in output_blobs:
                # Document AI adds suffixes and subdirectories. Look for the actual JSON output files.
                # The exact naming depends on the processor and options. Common pattern is ending with .json
                if blob.name.endswith(".json"):
                    local_download_path = os.path.basename(
                        blob.name
                    )  # Download to current directory with blob name
                    print(f"Downloading {blob.name} to {local_download_path}...")
                    blob.download_to_filename(local_download_path)
                    downloaded_files_texts.append(local_download_path)  # Track which outputs were downloaded

                    with open(local_download_path, "r", encoding="utf-8") as f:
                        document_data = json.load(f)

                    # The top-level 'text' field contains the concatenated plain text.
                    if "text" in document_data and document_data["text"] is not None:
                        raw_text = document_data["text"]
                        print(f"\n--- Raw Text Extracted from {blob.name} ---")
                        # Print only a snippet or process as needed
                        print(
                            raw_text[:1000] + "..."
                            if len(raw_text) > 1000
                            else raw_text
                        )
                        print("--------------------------------------------")
                        return raw_text
                        # Optional: Store the text. If you processed a batch of files,
                        # you might want to associate the text with the original file name.
                        # Document AI metadata might link output JSONs back to input files.
                        # For simplicity here, let's just show the extraction.
                        # If you know it was a single input PDF, this is all the text.
                        # If it was multiple, you'd need a mapping or process each JSON.
                    else:
                        print(
                            f"Warning: 'text' field not found in {blob.name} or is empty."
                        )
                    # Optional: Read and print a snippet of the JSON content
                    # with open(local_download_path, 'r', encoding='utf-8') as f:
                    #     data = json.load(f)
                    #     # Print some extracted text, for example (structure varies by processor)
                    #     if 'text' in data:
                    #         print(f"Extracted text snippet: {data['text'][:500]}...")  # Print first 500 chars
                    #     elif 'entities' in data:
                    #         print(f"Number of entities found: {len(data['entities'])}")
                    #     else:
                    #         print("Output JSON structure not immediately recognizable.")
                # break  # Uncomment if you only expect/need to process the first output file
            if not downloaded_files_texts:
                print("No JSON output files found in the specified output location.")
        except Exception as e:
            print(f"Error listing or downloading output files: {e}")

        print("\nProcess complete.")
        if downloaded_files_texts:
            print(f"Downloaded output file(s): {', '.join(downloaded_files_texts)}")
            print("These files contain the OCR results in JSON format.")
        else:
            print("No output files were successfully downloaded.")