Spaces:
Running
Running
luanpoppe
committed on
Commit
·
ab79998
1
Parent(s):
64ed488
fix: adicionando mais uma possibilidade de carregar o pdf caso dê erro com o pypdf
Browse files
_utils/bubble_integrations/obter_arquivo.py
CHANGED
@@ -1,6 +1,11 @@
|
|
1 |
# from setup.easy_imports import PyPDFLoader
|
2 |
import os
|
3 |
-
from langchain_community.document_loaders import
|
|
|
|
|
|
|
|
|
|
|
4 |
import tempfile
|
5 |
import requests
|
6 |
|
@@ -37,7 +42,10 @@ async def get_pdf_from_bubble(
|
|
37 |
else:
|
38 |
extension = file_url.split(".")[-1]
|
39 |
if extension.lower() == "pdf":
|
40 |
-
|
|
|
|
|
|
|
41 |
elif extension.lower() == "odt":
|
42 |
temp_path = download_file_from_bubble(file_url, headers, ".odt")
|
43 |
full_text = splitter_utils.load_odt_file(temp_path)
|
|
|
1 |
# from setup.easy_imports import PyPDFLoader
|
2 |
import os
|
3 |
+
from langchain_community.document_loaders import (
|
4 |
+
PyPDFLoader,
|
5 |
+
Docx2txtLoader,
|
6 |
+
TextLoader,
|
7 |
+
PyMuPDFLoader,
|
8 |
+
)
|
9 |
import tempfile
|
10 |
import requests
|
11 |
|
|
|
42 |
else:
|
43 |
extension = file_url.split(".")[-1]
|
44 |
if extension.lower() == "pdf":
|
45 |
+
try:
|
46 |
+
result = PyPDFLoader(file_url, headers=headers).load()
|
47 |
+
except:
|
48 |
+
result = PyMuPDFLoader(file_url, headers=headers).load()
|
49 |
elif extension.lower() == "odt":
|
50 |
temp_path = download_file_from_bubble(file_url, headers, ".odt")
|
51 |
full_text = splitter_utils.load_odt_file(temp_path)
|
_utils/bubble_integrations/tests/__init__.py
ADDED
File without changes
|
_utils/bubble_integrations/tests/test_obter_arquivo.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pytest
|
2 |
+
from _utils.bubble_integrations.obter_arquivo import (
|
3 |
+
get_pdf_from_bubble,
|
4 |
+
)
|
5 |
+
from langchain_core.documents import Document
|
6 |
+
from _utils.gerar_documento_utils.contextual_retriever import (
|
7 |
+
ContextualRetriever,
|
8 |
+
ContextualRetrieverUtils,
|
9 |
+
)
|
10 |
+
from _utils.gerar_documento_utils.llm_calls import agemini_answer
|
11 |
+
from _utils.models.gerar_documento import RetrievalConfig
|
12 |
+
from _utils.models.gerar_documento import (
|
13 |
+
ContextualizedChunk,
|
14 |
+
DocumentChunk,
|
15 |
+
RetrievalConfig,
|
16 |
+
)
|
17 |
+
from gerar_documento.serializer import (
|
18 |
+
GerarDocumentoComPDFProprioSerializer,
|
19 |
+
GerarDocumentoComPDFProprioSerializerData,
|
20 |
+
)
|
21 |
+
from setup.logging import Axiom
|
22 |
+
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
|
23 |
+
|
24 |
+
|
25 |
+
class TestObterArquivo:
|
26 |
+
@pytest.mark.asyncio
|
27 |
+
async def test_get_pdf_from_bubble(self):
|
28 |
+
file_url = "https://vella.app.br/version-5265j/fileupload/f1745094959601x803512841326306200/000_I%CC%81ntegra%20%200015348-89.2020.4.06.0981.pdf"
|
29 |
+
|
30 |
+
response = await get_pdf_from_bubble(file_url, False)
|
31 |
+
assert len(response) > 0
|
32 |
+
for page in response:
|
33 |
+
assert isinstance(page, Document)
|
_utils/langchain_utils/Splitter_class.py
CHANGED
@@ -11,6 +11,7 @@ from setup.easy_imports import (
|
|
11 |
Document,
|
12 |
Docx2txtLoader,
|
13 |
TextLoader,
|
|
|
14 |
)
|
15 |
from typing import Any, List, Dict, Tuple, Optional, cast
|
16 |
from _utils.models.gerar_documento import (
|
@@ -71,7 +72,10 @@ class Splitter:
|
|
71 |
file_extension = self.splitter_util.get_file_type(pdf_path)
|
72 |
print("file_extension: ", file_extension)
|
73 |
if file_extension == "pdf":
|
74 |
-
|
|
|
|
|
|
|
75 |
elif file_extension == "odt":
|
76 |
full_text = self.splitter_util.load_odt_file(pdf_path)
|
77 |
pages = self.splitter_simple.load_and_split_text(full_text)
|
|
|
11 |
Document,
|
12 |
Docx2txtLoader,
|
13 |
TextLoader,
|
14 |
+
PyMuPDFLoader,
|
15 |
)
|
16 |
from typing import Any, List, Dict, Tuple, Optional, cast
|
17 |
from _utils.models.gerar_documento import (
|
|
|
72 |
file_extension = self.splitter_util.get_file_type(pdf_path)
|
73 |
print("file_extension: ", file_extension)
|
74 |
if file_extension == "pdf":
|
75 |
+
try:
|
76 |
+
pages = PyPDFLoader(pdf_path).load()
|
77 |
+
except:
|
78 |
+
pages = PyMuPDFLoader(pdf_path).load()
|
79 |
elif file_extension == "odt":
|
80 |
full_text = self.splitter_util.load_odt_file(pdf_path)
|
81 |
pages = self.splitter_simple.load_and_split_text(full_text)
|
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
|
|
setup/easy_imports.py
CHANGED
@@ -13,7 +13,12 @@ from langchain_huggingface import HuggingFaceEmbeddings
|
|
13 |
from langchain.prompts import PromptTemplate
|
14 |
from langchain_core.runnables import RunnablePassthrough
|
15 |
from langchain_core.prompts import ChatPromptTemplate
|
16 |
-
from langchain_community.document_loaders import
|
|
|
|
|
|
|
|
|
|
|
17 |
from langchain_community.vectorstores import Chroma
|
18 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
19 |
|
|
|
13 |
from langchain.prompts import PromptTemplate
|
14 |
from langchain_core.runnables import RunnablePassthrough
|
15 |
from langchain_core.prompts import ChatPromptTemplate
|
16 |
+
from langchain_community.document_loaders import (
|
17 |
+
PyPDFLoader,
|
18 |
+
Docx2txtLoader,
|
19 |
+
TextLoader,
|
20 |
+
PyMuPDFLoader,
|
21 |
+
)
|
22 |
from langchain_community.vectorstores import Chroma
|
23 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
24 |
|