luanpoppe committed on
Commit
5cb00b6
·
1 Parent(s): 09a8a72

feat: adicionado suporte para .odt e .txt

Browse files
_utils/bubble_integrations/obter_arquivo.py CHANGED
@@ -1,13 +1,14 @@
1
  # from setup.easy_imports import PyPDFLoader
2
  import os
3
- from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
4
  import tempfile
5
  import requests
6
 
7
  from _utils.handle_files import return_document_list_with_llama_parser
8
- from _utils.langchain_utils.splitter_util import SplitterUtils
9
 
10
  splitter_utils = SplitterUtils()
 
11
  headers = {"Authorization": f"Bearer {os.environ.get("BUBBLE_TOKEN")}"}
12
 
13
 
@@ -36,20 +37,27 @@ async def get_pdf_from_bubble(
36
  else:
37
  extension = file_url.split(".")[-1]
38
  if extension.lower() == "pdf":
39
- result = PyPDFLoader(file_url, headers=headers)
 
 
 
 
 
 
 
40
  else:
41
- temp_path = download_docx(file_url, headers)
42
- result = Docx2txtLoader(temp_path)
43
 
44
- return result.load()
45
 
46
 
47
- def download_docx(url, headers):
48
  response = requests.get(url, headers=headers)
49
  response.raise_for_status() # Raise an exception for bad responses (status codes 4xx or 5xx)
50
 
51
  # Save the downloaded file into a temporary file
52
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".docx")
53
  with open(temp_file.name, "wb") as f:
54
  f.write(response.content) # por enquanto este arquivo não está sendo excluído
55
 
 
1
  # from setup.easy_imports import PyPDFLoader
2
  import os
3
+ from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
4
  import tempfile
5
  import requests
6
 
7
  from _utils.handle_files import return_document_list_with_llama_parser
8
+ from _utils.langchain_utils.splitter_util import Splitter_Simple, SplitterUtils
9
 
10
  splitter_utils = SplitterUtils()
11
+ splitter_simple = Splitter_Simple()
12
  headers = {"Authorization": f"Bearer {os.environ.get("BUBBLE_TOKEN")}"}
13
 
14
 
 
37
  else:
38
  extension = file_url.split(".")[-1]
39
  if extension.lower() == "pdf":
40
+ result = PyPDFLoader(file_url, headers=headers).load()
41
+ elif extension.lower() == "odt":
42
+ temp_path = download_file_from_bubble(file_url, headers, ".odt")
43
+ full_text = splitter_utils.load_odt_file(temp_path)
44
+ result = splitter_simple.load_and_split_text(full_text)
45
+ elif extension.lower() == "txt":
46
+ temp_path = download_file_from_bubble(file_url, headers, ".txt")
47
+ result = TextLoader(temp_path).load()
48
  else:
49
+ temp_path = download_file_from_bubble(file_url, headers, ".docx")
50
+ result = Docx2txtLoader(temp_path).load()
51
 
52
+ return result
53
 
54
 
55
+ def download_file_from_bubble(url, headers, extension: str):
56
  response = requests.get(url, headers=headers)
57
  response.raise_for_status() # Raise an exception for bad responses (status codes 4xx or 5xx)
58
 
59
  # Save the downloaded file into a temporary file
60
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=extension)
61
  with open(temp_file.name, "wb") as f:
62
  f.write(response.content) # por enquanto este arquivo não está sendo excluído
63
 
_utils/langchain_utils/Splitter_class.py CHANGED
@@ -1,6 +1,7 @@
1
  from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
2
  from _utils.handle_files import return_document_list_with_llama_parser
3
  from _utils.langchain_utils.splitter_util import (
 
4
  SplitterUtils,
5
  combine_documents_without_losing_pagination,
6
  )
@@ -9,6 +10,7 @@ from setup.easy_imports import (
9
  RecursiveCharacterTextSplitter,
10
  Document,
11
  Docx2txtLoader,
 
12
  )
13
  from typing import Any, List, Dict, Tuple, Optional, cast
14
  from _utils.models.gerar_relatorio import (
@@ -16,8 +18,6 @@ from _utils.models.gerar_relatorio import (
16
  )
17
  import uuid
18
 
19
- splitter_utils = SplitterUtils()
20
-
21
 
22
  class Splitter:
23
  def __init__(
@@ -25,6 +25,7 @@ class Splitter:
25
  chunk_size,
26
  chunk_overlap,
27
  ):
 
28
  self.splitter_simple = Splitter_Simple(chunk_size, chunk_overlap)
29
  self.text_splitter = RecursiveCharacterTextSplitter(
30
  chunk_size=chunk_size, chunk_overlap=chunk_overlap
@@ -56,11 +57,6 @@ class Splitter:
56
  pages
57
  )
58
  )
59
- # for page in pages:
60
- # full_text_as_string = full_text_as_string + page.page_content
61
- # chunks_of_string_only = chunks_of_string_only + self.text_splitter.split_text(
62
- # combined_text
63
- # )
64
  else:
65
  if should_use_llama_parse:
66
  print("\nENVIANDO PDFS PARA LLAMA PARSE")
@@ -73,10 +69,15 @@ class Splitter:
73
  )
74
  else:
75
  print("\nCOMEÇANDO LEITURA DO PDF")
76
- file_extension = splitter_utils.get_file_type(pdf_path)
77
  print("file_extension: ", file_extension)
78
  if file_extension == "pdf":
79
  pages = PyPDFLoader(pdf_path).load()
 
 
 
 
 
80
  else:
81
  pages = Docx2txtLoader(pdf_path).load()
82
  print("TERMINOU LEITURA DO PDF")
@@ -177,40 +178,3 @@ class Splitter:
177
  char_count += len(text)
178
 
179
  return chunks
180
-
181
-
182
- class Splitter_Simple:
183
- def __init__(
184
- self,
185
- chunk_size=1000,
186
- chunk_overlap=400,
187
- ):
188
- self.text_splitter = RecursiveCharacterTextSplitter(
189
- chunk_size=chunk_size, chunk_overlap=chunk_overlap
190
- )
191
-
192
- async def load_and_split_document(self, pdf_path: str):
193
- """Load PDF and split into chunks with metadata"""
194
- print("\nCOMEÇANDO LEITURA DO PDF")
195
- pages = PyPDFLoader(pdf_path).load_and_split(self.text_splitter)
196
- print("\nTERMINADO LEITURA DO PDF")
197
-
198
- return pages
199
-
200
- def load_and_split_text(self, text: str) -> List[Document]:
201
- documents: List[Document] = []
202
- chunks = self.text_splitter.split_text(text)
203
-
204
- for chunk in chunks:
205
- documents.append(Document(page_content=chunk))
206
-
207
- return documents
208
-
209
- def get_chunks_of_string_only_from_list_of_documents(
210
- self, lista_de_documentos: List[Document]
211
- ):
212
- full_text_as_string = ""
213
- for page in lista_de_documentos:
214
- full_text_as_string = full_text_as_string + page.page_content
215
- full_text_as_array = self.text_splitter.split_text(full_text_as_string)
216
- return full_text_as_array
 
1
  from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
2
  from _utils.handle_files import return_document_list_with_llama_parser
3
  from _utils.langchain_utils.splitter_util import (
4
+ Splitter_Simple,
5
  SplitterUtils,
6
  combine_documents_without_losing_pagination,
7
  )
 
10
  RecursiveCharacterTextSplitter,
11
  Document,
12
  Docx2txtLoader,
13
+ TextLoader,
14
  )
15
  from typing import Any, List, Dict, Tuple, Optional, cast
16
  from _utils.models.gerar_relatorio import (
 
18
  )
19
  import uuid
20
 
 
 
21
 
22
  class Splitter:
23
  def __init__(
 
25
  chunk_size,
26
  chunk_overlap,
27
  ):
28
+ self.splitter_util = SplitterUtils()
29
  self.splitter_simple = Splitter_Simple(chunk_size, chunk_overlap)
30
  self.text_splitter = RecursiveCharacterTextSplitter(
31
  chunk_size=chunk_size, chunk_overlap=chunk_overlap
 
57
  pages
58
  )
59
  )
 
 
 
 
 
60
  else:
61
  if should_use_llama_parse:
62
  print("\nENVIANDO PDFS PARA LLAMA PARSE")
 
69
  )
70
  else:
71
  print("\nCOMEÇANDO LEITURA DO PDF")
72
+ file_extension = self.splitter_util.get_file_type(pdf_path)
73
  print("file_extension: ", file_extension)
74
  if file_extension == "pdf":
75
  pages = PyPDFLoader(pdf_path).load()
76
+ elif file_extension == "odt":
77
+ full_text = self.splitter_util.load_odt_file(pdf_path)
78
+ pages = self.splitter_simple.load_and_split_text(full_text)
79
+ elif file_extension == "txt":
80
+ pages = TextLoader(pdf_path).load()
81
  else:
82
  pages = Docx2txtLoader(pdf_path).load()
83
  print("TERMINOU LEITURA DO PDF")
 
178
  char_count += len(text)
179
 
180
  return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
_utils/langchain_utils/splitter_util.py CHANGED
@@ -1,6 +1,13 @@
1
  import os
2
  from typing import List, Tuple
3
  from langchain_core.documents import Document
 
 
 
 
 
 
 
4
 
5
 
6
  class SplitterUtils:
@@ -11,10 +18,57 @@ class SplitterUtils:
11
  return "pdf"
12
  elif ext == ".docx":
13
  return "word"
 
 
 
 
14
  else:
15
  print("\next", ext)
16
  return "unknown"
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  def combine_documents_without_losing_pagination(documents: list[Document]):
20
  combined_text = ""
 
1
  import os
2
  from typing import List, Tuple
3
  from langchain_core.documents import Document
4
+ from odf.opendocument import load
5
+ from odf.text import P
6
+ from typing import List
7
+ from setup.easy_imports import (
8
+ PyPDFLoader,
9
+ RecursiveCharacterTextSplitter,
10
+ )
11
 
12
 
13
  class SplitterUtils:
 
18
  return "pdf"
19
  elif ext == ".docx":
20
  return "word"
21
+ elif ext == ".odt":
22
+ return "odt"
23
+ elif ext == ".txt":
24
+ return "txt"
25
  else:
26
  print("\next", ext)
27
  return "unknown"
28
 
29
+ def load_odt_file(self, file_path: str):
30
+ textdoc = load(file_path)
31
+ all_paragraphs = textdoc.getElementsByType(P)
32
+ text = "\n".join([p.firstChild.data for p in all_paragraphs if p.firstChild])
33
+ return text
34
+
35
+
36
class Splitter_Simple:
    """Thin convenience wrapper around RecursiveCharacterTextSplitter.

    Provides simple chunking helpers: split a PDF file, split a raw string
    into Document chunks, or re-split a list of Documents into plain-string
    chunks.
    """

    def __init__(
        self,
        chunk_size=1000,
        chunk_overlap=400,
    ):
        # Single splitter instance reused by every helper below.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )

    async def load_and_split_document(self, pdf_path: str):
        """Load PDF and split into chunks with metadata"""
        print("\nCOMEÇANDO LEITURA DO PDF")
        pages = PyPDFLoader(pdf_path).load_and_split(self.text_splitter)
        print("\nTERMINADO LEITURA DO PDF")

        return pages

    def load_and_split_text(self, text: str) -> List[Document]:
        """Split a raw string and wrap each chunk in a metadata-less Document."""
        chunks = self.text_splitter.split_text(text)
        return [Document(page_content=chunk) for chunk in chunks]

    def get_chunks_of_string_only_from_list_of_documents(
        self, lista_de_documentos: List[Document]
    ):
        """Concatenate the documents' text and re-split it into plain strings.

        Pagination metadata is intentionally discarded: pages are merged into
        one string before splitting.
        """
        full_text_as_string = "".join(
            page.page_content for page in lista_de_documentos
        )
        return self.text_splitter.split_text(full_text_as_string)
70
+ return full_text_as_array
71
+
72
 
73
  def combine_documents_without_losing_pagination(documents: list[Document]):
74
  combined_text = ""
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
setup/easy_imports.py CHANGED
@@ -13,7 +13,7 @@ from langchain_huggingface import HuggingFaceEmbeddings
13
  from langchain.prompts import PromptTemplate
14
  from langchain_core.runnables import RunnablePassthrough
15
  from langchain_core.prompts import ChatPromptTemplate
16
- from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
17
  from langchain_community.vectorstores import Chroma
18
  from langchain_google_genai import ChatGoogleGenerativeAI
19
 
 
13
  from langchain.prompts import PromptTemplate
14
  from langchain_core.runnables import RunnablePassthrough
15
  from langchain_core.prompts import ChatPromptTemplate
16
+ from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
17
  from langchain_community.vectorstores import Chroma
18
  from langchain_google_genai import ChatGoogleGenerativeAI
19