luanpoppe committed on
Commit
6e09bf4
·
1 Parent(s): d8410b4

feat: adicionando suporte a arquivos do word

Browse files
_utils/bubble_integrations/obter_arquivo.py CHANGED
@@ -1,11 +1,13 @@
1
  # from setup.easy_imports import PyPDFLoader
2
  import os
3
- from langchain_community.document_loaders import PyPDFLoader
4
  import tempfile
5
  import requests
6
 
7
  from _utils.handle_files import return_document_list_with_llama_parser
 
8
 
 
9
# Use single quotes for the key inside the f-string: reusing double quotes
# inside an f-string is a SyntaxError on Python < 3.12 (PEP 701 only
# allows it from 3.12 onward).
headers = {"Authorization": f"Bearer {os.environ.get('BUBBLE_TOKEN')}"}
10
 
11
 
@@ -32,6 +34,23 @@ async def get_pdf_from_bubble(
32
  tmp_file.name
33
  ) # por enquanto este arquivo não está sendo excluído
34
  else:
35
- result = PyPDFLoader(file_url, headers=headers)
 
 
 
 
 
36
 
37
  return result.load()
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # from setup.easy_imports import PyPDFLoader
2
  import os
3
+ from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
4
  import tempfile
5
  import requests
6
 
7
  from _utils.handle_files import return_document_list_with_llama_parser
8
+ from _utils.splitters.splitter_util import SplitterUtils
9
 
10
+ splitter_utils = SplitterUtils()
11
# Use single quotes for the key inside the f-string: reusing double quotes
# inside an f-string is a SyntaxError on Python < 3.12 (PEP 701 only
# allows it from 3.12 onward).
headers = {"Authorization": f"Bearer {os.environ.get('BUBBLE_TOKEN')}"}
12
 
13
 
 
34
  tmp_file.name
35
  ) # por enquanto este arquivo não está sendo excluído
36
  else:
37
+ extension = file_url.split(".")[-1]
38
+ if extension.lower() == "pdf":
39
+ result = PyPDFLoader(file_url, headers=headers)
40
+ else:
41
+ temp_path = download_docx(file_url, headers)
42
+ result = Docx2txtLoader(temp_path)
43
 
44
  return result.load()
45
+
46
+
47
def download_docx(url, headers):
    """Download a .docx file from *url* and return the path of a local copy.

    Args:
        url: Direct URL of the .docx file to fetch.
        headers: HTTP headers to send (e.g. the Bubble bearer token).

    Returns:
        Path of a temporary ``.docx`` file holding the downloaded content.
        The file is created with ``delete=False``, so the caller is
        responsible for removing it.  # TODO: delete the file after use

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # fail fast on bad responses (4xx or 5xx)
    # Write through the already-open handle instead of reopening the file by
    # name: the original double-open leaked the NamedTemporaryFile descriptor
    # and reopening an open temp file is not allowed on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as temp_file:
        temp_file.write(response.content)
    return temp_file.name
_utils/handle_files.py CHANGED
@@ -4,6 +4,8 @@ from langchain_core.documents import Document as LangchainDocument
4
  from llama_index import Document
5
  from llama_parse import LlamaParse, ResultType
6
 
 
 
7
  llama_parser_keys = [
8
  os.getenv("LLAMA_CLOUD_API_KEY_POPS"),
9
  os.getenv("LLAMA_CLOUD_API_KEY_PEIXE"),
@@ -13,9 +15,10 @@ llama_parser_keys = [
13
  def handle_pdf_files_from_serializer(files):
14
  listaPDFs = []
15
  for file in files:
 
16
  file.seek(0)
17
  with tempfile.NamedTemporaryFile(
18
- delete=False, suffix=".pdf"
19
  ) as temp_file: # Create a temporary file to save the uploaded PDF
20
  for (
21
  chunk
 
4
  from llama_index import Document
5
  from llama_parse import LlamaParse, ResultType
6
 
7
+ from _utils.splitters.splitter_util import SplitterUtils
8
+
9
  llama_parser_keys = [
10
  os.getenv("LLAMA_CLOUD_API_KEY_POPS"),
11
  os.getenv("LLAMA_CLOUD_API_KEY_PEIXE"),
 
15
  def handle_pdf_files_from_serializer(files):
16
  listaPDFs = []
17
  for file in files:
18
+ file_extension = file.name.split(".")[-1]
19
  file.seek(0)
20
  with tempfile.NamedTemporaryFile(
21
+ delete=False, suffix=f".{file_extension}"
22
  ) as temp_file: # Create a temporary file to save the uploaded PDF
23
  for (
24
  chunk
_utils/splitters/Splitter_class.py CHANGED
@@ -1,13 +1,23 @@
1
  from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
2
  from _utils.handle_files import return_document_list_with_llama_parser
3
- from _utils.splitters.splitter_util import combine_documents_without_losing_pagination
4
- from setup.easy_imports import PyPDFLoader, RecursiveCharacterTextSplitter, Document
 
 
 
 
 
 
 
 
5
  from typing import Any, List, Dict, Tuple, Optional, cast
6
  from _utils.models.gerar_relatorio import (
7
  DocumentChunk,
8
  )
9
  import uuid
10
 
 
 
11
 
12
  class Splitter:
13
  def __init__(
@@ -34,7 +44,7 @@ class Splitter:
34
 
35
  if isBubble:
36
  print("\nPEGANDO PDF DO BUBBLE")
37
- pages = await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
38
  page_boundaries, combined_text = (
39
  combine_documents_without_losing_pagination(pages)
40
  )
@@ -53,8 +63,14 @@ class Splitter:
53
  )
54
  else:
55
  print("\nCOMEÇANDO LEITURA DO PDF")
56
- pages = PyPDFLoader(pdf_path).load()
 
 
 
 
 
57
  print("TERMINOU LEITURA DO PDF")
 
58
  page_boundaries, combined_text = (
59
  combine_documents_without_losing_pagination(pages)
60
  )
 
1
  from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
2
  from _utils.handle_files import return_document_list_with_llama_parser
3
+ from _utils.splitters.splitter_util import (
4
+ SplitterUtils,
5
+ combine_documents_without_losing_pagination,
6
+ )
7
+ from setup.easy_imports import (
8
+ PyPDFLoader,
9
+ RecursiveCharacterTextSplitter,
10
+ Document,
11
+ Docx2txtLoader,
12
+ )
13
  from typing import Any, List, Dict, Tuple, Optional, cast
14
  from _utils.models.gerar_relatorio import (
15
  DocumentChunk,
16
  )
17
  import uuid
18
 
19
+ splitter_utils = SplitterUtils()
20
+
21
 
22
  class Splitter:
23
  def __init__(
 
44
 
45
  if isBubble:
46
  print("\nPEGANDO PDF DO BUBBLE")
47
+ pages = await get_pdf_from_bubble(pdf_path, should_use_llama_parse) # type: ignore
48
  page_boundaries, combined_text = (
49
  combine_documents_without_losing_pagination(pages)
50
  )
 
63
  )
64
  else:
65
  print("\nCOMEÇANDO LEITURA DO PDF")
66
+ file_extension = splitter_utils.get_file_type(pdf_path)
67
+ print("file_extension: ", file_extension)
68
+ if file_extension == "pdf":
69
+ pages = PyPDFLoader(pdf_path).load()
70
+ else:
71
+ pages = Docx2txtLoader(pdf_path).load()
72
  print("TERMINOU LEITURA DO PDF")
73
+ print("pages: ", pages)
74
  page_boundaries, combined_text = (
75
  combine_documents_without_losing_pagination(pages)
76
  )
_utils/splitters/splitter_util.py CHANGED
@@ -1,7 +1,21 @@
 
1
  from typing import List, Tuple
2
  from langchain_core.documents import Document
3
 
4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  def combine_documents_without_losing_pagination(documents: list[Document]):
6
  combined_text = ""
7
  page_boundaries: List[Tuple[int, int, int]] = (
 
1
+ import os
2
  from typing import List, Tuple
3
  from langchain_core.documents import Document
4
 
5
 
6
class SplitterUtils:
    """Small helpers shared by the document splitters."""

    # Normalized file extensions mapped to the loader category they use.
    _EXTENSION_KINDS = {".pdf": "pdf", ".docx": "word"}

    def get_file_type(self, file_path):
        """Return "pdf", "word" or "unknown" based on *file_path*'s extension.

        The extension comparison is case-insensitive; unmapped extensions
        are reported as "unknown".
        """
        extension = os.path.splitext(file_path)[1].lower()
        kind = self._EXTENSION_KINDS.get(extension)
        if kind is None:
            print("\next", extension)  # debug trace for unmapped extensions
            return "unknown"
        return kind
17
+
18
+
19
  def combine_documents_without_losing_pagination(documents: list[Document]):
20
  combined_text = ""
21
  page_boundaries: List[Tuple[int, int, int]] = (
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
setup/easy_imports.py CHANGED
@@ -12,7 +12,7 @@ from langchain_huggingface import HuggingFaceEmbeddings
12
  # from langchain_community.embeddings import HuggingFaceEmbeddings
13
  from langchain.prompts import PromptTemplate
14
  from langchain_core.prompts import ChatPromptTemplate
15
- from langchain_community.document_loaders import PyPDFLoader
16
  from langchain_community.vectorstores import Chroma
17
  from langchain_google_genai import ChatGoogleGenerativeAI
18
 
 
12
  # from langchain_community.embeddings import HuggingFaceEmbeddings
13
  from langchain.prompts import PromptTemplate
14
  from langchain_core.prompts import ChatPromptTemplate
15
+ from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
16
  from langchain_community.vectorstores import Chroma
17
  from langchain_google_genai import ChatGoogleGenerativeAI
18