Spaces:
Running
Running
luanpoppe
committed on
Commit
·
01a4e83
1
Parent(s):
9cd1a8d
fix: ocr através da bubble
Browse files
_utils/langchain_utils/Splitter_class.py
CHANGED
@@ -1,6 +1,9 @@
|
|
1 |
import os
|
2 |
import time
|
3 |
-
from _utils.bubble_integrations.obter_arquivo import
|
|
|
|
|
|
|
4 |
from _utils.handle_files import return_document_list_with_llama_parser
|
5 |
from _utils.langchain_utils.splitter_util import (
|
6 |
Splitter_Simple,
|
@@ -30,6 +33,7 @@ from _utils.google_integration.google_cloud import (
|
|
30 |
)
|
31 |
from google.cloud import documentai
|
32 |
from google.cloud import storage
|
|
|
33 |
|
34 |
|
35 |
class Splitter:
|
@@ -160,6 +164,8 @@ class Splitter:
|
|
160 |
print("TERMINOU DE ORGANIZAR PDFS EM CHUNKS")
|
161 |
|
162 |
if len(pages) == 0 or len(chunks) == 0:
|
|
|
|
|
163 |
text = await self.getOCRFromGoogleDocumentAPI(pdf_path)
|
164 |
chunks = self.load_and_split_text(text) # type: ignore
|
165 |
chunks_of_string_only = [chunk.content for chunk in chunks]
|
|
|
1 |
import os
|
2 |
import time
|
3 |
+
from _utils.bubble_integrations.obter_arquivo import (
|
4 |
+
download_file_from_bubble,
|
5 |
+
get_pdf_from_bubble,
|
6 |
+
)
|
7 |
from _utils.handle_files import return_document_list_with_llama_parser
|
8 |
from _utils.langchain_utils.splitter_util import (
|
9 |
Splitter_Simple,
|
|
|
33 |
)
|
34 |
from google.cloud import documentai
|
35 |
from google.cloud import storage
|
36 |
+
from _utils.bubble_integrations.obter_arquivo import headers
|
37 |
|
38 |
|
39 |
class Splitter:
|
|
|
164 |
print("TERMINOU DE ORGANIZAR PDFS EM CHUNKS")
|
165 |
|
166 |
if len(pages) == 0 or len(chunks) == 0:
|
167 |
+
if isBubble:
|
168 |
+
pdf_path = download_file_from_bubble(pdf_path, headers, "pdf")
|
169 |
text = await self.getOCRFromGoogleDocumentAPI(pdf_path)
|
170 |
chunks = self.load_and_split_text(text) # type: ignore
|
171 |
chunks_of_string_only = [chunk.content for chunk in chunks]
|