luanpoppe committed on
Commit
01a4e83
·
1 Parent(s): 9cd1a8d

fix: ocr através da bubble

Browse files
_utils/langchain_utils/Splitter_class.py CHANGED
@@ -1,6 +1,9 @@
1
  import os
2
  import time
3
- from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
 
 
 
4
  from _utils.handle_files import return_document_list_with_llama_parser
5
  from _utils.langchain_utils.splitter_util import (
6
  Splitter_Simple,
@@ -30,6 +33,7 @@ from _utils.google_integration.google_cloud import (
30
  )
31
  from google.cloud import documentai
32
  from google.cloud import storage
 
33
 
34
 
35
  class Splitter:
@@ -160,6 +164,8 @@ class Splitter:
160
  print("TERMINOU DE ORGANIZAR PDFS EM CHUNKS")
161
 
162
  if len(pages) == 0 or len(chunks) == 0:
 
 
163
  text = await self.getOCRFromGoogleDocumentAPI(pdf_path)
164
  chunks = self.load_and_split_text(text) # type: ignore
165
  chunks_of_string_only = [chunk.content for chunk in chunks]
 
1
  import os
2
  import time
3
+ from _utils.bubble_integrations.obter_arquivo import (
4
+ download_file_from_bubble,
5
+ get_pdf_from_bubble,
6
+ )
7
  from _utils.handle_files import return_document_list_with_llama_parser
8
  from _utils.langchain_utils.splitter_util import (
9
  Splitter_Simple,
 
33
  )
34
  from google.cloud import documentai
35
  from google.cloud import storage
36
+ from _utils.bubble_integrations.obter_arquivo import headers
37
 
38
 
39
  class Splitter:
 
164
  print("TERMINOU DE ORGANIZAR PDFS EM CHUNKS")
165
 
166
  if len(pages) == 0 or len(chunks) == 0:
167
+ if isBubble:
168
+ pdf_path = download_file_from_bubble(pdf_path, headers, "pdf")
169
  text = await self.getOCRFromGoogleDocumentAPI(pdf_path)
170
  chunks = self.load_and_split_text(text) # type: ignore
171
  chunks_of_string_only = [chunk.content for chunk in chunks]