luanpoppe committed on
Commit
ab79998
·
1 Parent(s): 64ed488

fix: adicionando mais uma possibilidade de carregar o pdf caso dê erro com o pypdf

Browse files
_utils/bubble_integrations/obter_arquivo.py CHANGED
@@ -1,6 +1,11 @@
1
  # from setup.easy_imports import PyPDFLoader
2
  import os
3
- from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
 
 
 
 
 
4
  import tempfile
5
  import requests
6
 
@@ -37,7 +42,10 @@ async def get_pdf_from_bubble(
37
  else:
38
  extension = file_url.split(".")[-1]
39
  if extension.lower() == "pdf":
40
- result = PyPDFLoader(file_url, headers=headers).load()
 
 
 
41
  elif extension.lower() == "odt":
42
  temp_path = download_file_from_bubble(file_url, headers, ".odt")
43
  full_text = splitter_utils.load_odt_file(temp_path)
 
1
  # from setup.easy_imports import PyPDFLoader
2
  import os
3
+ from langchain_community.document_loaders import (
4
+ PyPDFLoader,
5
+ Docx2txtLoader,
6
+ TextLoader,
7
+ PyMuPDFLoader,
8
+ )
9
  import tempfile
10
  import requests
11
 
 
42
  else:
43
  extension = file_url.split(".")[-1]
44
  if extension.lower() == "pdf":
45
+ try:
46
+ result = PyPDFLoader(file_url, headers=headers).load()
47
+ except:
48
+ result = PyMuPDFLoader(file_url, headers=headers).load()
49
  elif extension.lower() == "odt":
50
  temp_path = download_file_from_bubble(file_url, headers, ".odt")
51
  full_text = splitter_utils.load_odt_file(temp_path)
_utils/bubble_integrations/tests/__init__.py ADDED
File without changes
_utils/bubble_integrations/tests/test_obter_arquivo.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from _utils.bubble_integrations.obter_arquivo import (
3
+ get_pdf_from_bubble,
4
+ )
5
+ from langchain_core.documents import Document
6
+ from _utils.gerar_documento_utils.contextual_retriever import (
7
+ ContextualRetriever,
8
+ ContextualRetrieverUtils,
9
+ )
10
+ from _utils.gerar_documento_utils.llm_calls import agemini_answer
11
+ from _utils.models.gerar_documento import RetrievalConfig
12
+ from _utils.models.gerar_documento import (
13
+ ContextualizedChunk,
14
+ DocumentChunk,
15
+ RetrievalConfig,
16
+ )
17
+ from gerar_documento.serializer import (
18
+ GerarDocumentoComPDFProprioSerializer,
19
+ GerarDocumentoComPDFProprioSerializerData,
20
+ )
21
+ from setup.logging import Axiom
22
+ from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
23
+
24
+
25
+ class TestObterArquivo:
26
+ @pytest.mark.asyncio
27
+ async def test_get_pdf_from_bubble(self):
28
+ file_url = "https://vella.app.br/version-5265j/fileupload/f1745094959601x803512841326306200/000_I%CC%81ntegra%20%200015348-89.2020.4.06.0981.pdf"
29
+
30
+ response = await get_pdf_from_bubble(file_url, False)
31
+ assert len(response) > 0
32
+ for page in response:
33
+ assert isinstance(page, Document)
_utils/langchain_utils/Splitter_class.py CHANGED
@@ -11,6 +11,7 @@ from setup.easy_imports import (
11
  Document,
12
  Docx2txtLoader,
13
  TextLoader,
 
14
  )
15
  from typing import Any, List, Dict, Tuple, Optional, cast
16
  from _utils.models.gerar_documento import (
@@ -71,7 +72,10 @@ class Splitter:
71
  file_extension = self.splitter_util.get_file_type(pdf_path)
72
  print("file_extension: ", file_extension)
73
  if file_extension == "pdf":
74
- pages = PyPDFLoader(pdf_path).load()
 
 
 
75
  elif file_extension == "odt":
76
  full_text = self.splitter_util.load_odt_file(pdf_path)
77
  pages = self.splitter_simple.load_and_split_text(full_text)
 
11
  Document,
12
  Docx2txtLoader,
13
  TextLoader,
14
+ PyMuPDFLoader,
15
  )
16
  from typing import Any, List, Dict, Tuple, Optional, cast
17
  from _utils.models.gerar_documento import (
 
72
  file_extension = self.splitter_util.get_file_type(pdf_path)
73
  print("file_extension: ", file_extension)
74
  if file_extension == "pdf":
75
+ try:
76
+ pages = PyPDFLoader(pdf_path).load()
77
+ except:
78
+ pages = PyMuPDFLoader(pdf_path).load()
79
  elif file_extension == "odt":
80
  full_text = self.splitter_util.load_odt_file(pdf_path)
81
  pages = self.splitter_simple.load_and_split_text(full_text)
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
setup/easy_imports.py CHANGED
@@ -13,7 +13,12 @@ from langchain_huggingface import HuggingFaceEmbeddings
13
  from langchain.prompts import PromptTemplate
14
  from langchain_core.runnables import RunnablePassthrough
15
  from langchain_core.prompts import ChatPromptTemplate
16
- from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
 
 
 
 
 
17
  from langchain_community.vectorstores import Chroma
18
  from langchain_google_genai import ChatGoogleGenerativeAI
19
 
 
13
  from langchain.prompts import PromptTemplate
14
  from langchain_core.runnables import RunnablePassthrough
15
  from langchain_core.prompts import ChatPromptTemplate
16
+ from langchain_community.document_loaders import (
17
+ PyPDFLoader,
18
+ Docx2txtLoader,
19
+ TextLoader,
20
+ PyMuPDFLoader,
21
+ )
22
  from langchain_community.vectorstores import Chroma
23
  from langchain_google_genai import ChatGoogleGenerativeAI
24