luanpoppe committed · Commit 5cb00b6 · 1 Parent(s): 09a8a72

feat: added support for .odt and .txt
_utils/bubble_integrations/obter_arquivo.py
CHANGED
```diff
@@ -1,13 +1,14 @@
 # from setup.easy_imports import PyPDFLoader
 import os
-from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
+from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
 import tempfile
 import requests
 
 from _utils.handle_files import return_document_list_with_llama_parser
-from _utils.langchain_utils.splitter_util import SplitterUtils
+from _utils.langchain_utils.splitter_util import Splitter_Simple, SplitterUtils
 
 splitter_utils = SplitterUtils()
+splitter_simple = Splitter_Simple()
 headers = {"Authorization": f"Bearer {os.environ.get("BUBBLE_TOKEN")}"}
 
 
@@ -36,20 +37,27 @@ async def get_pdf_from_bubble(
     else:
         extension = file_url.split(".")[-1]
         if extension.lower() == "pdf":
-            result = PyPDFLoader(file_url, headers=headers)
+            result = PyPDFLoader(file_url, headers=headers).load()
+        elif extension.lower() == "odt":
+            temp_path = download_file_from_bubble(file_url, headers, ".odt")
+            full_text = splitter_utils.load_odt_file(temp_path)
+            result = splitter_simple.load_and_split_text(full_text)
+        elif extension.lower() == "txt":
+            temp_path = download_file_from_bubble(file_url, headers, ".txt")
+            result = TextLoader(temp_path).load()
         else:
-            temp_path = download_file_from_bubble(file_url, headers)
-            result = Docx2txtLoader(temp_path)
+            temp_path = download_file_from_bubble(file_url, headers, ".docx")
+            result = Docx2txtLoader(temp_path).load()
 
-    return result
+    return result
 
 
-def download_file_from_bubble(url, headers):
+def download_file_from_bubble(url, headers, extension: str):
     response = requests.get(url, headers=headers)
     response.raise_for_status()  # Raise an exception for bad responses (status codes 4xx or 5xx)
 
     # Save the downloaded file into a temporary file
-    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".docx")
+    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=extension)
     with open(temp_file.name, "wb") as f:
         f.write(response.content)  # por enquanto este arquivo não está sendo excluído
```
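Note: the inline comment on the last added line ("por enquanto este arquivo não está sendo excluído" — "for now this file is not being deleted") flags a leak: `NamedTemporaryFile(delete=False, ...)` leaves every download on disk. A minimal sketch of the same helper with caller-side cleanup; the names here are illustrative, not part of the commit:

```python
import os
import tempfile

import requests


def download_to_temp(url: str, headers: dict, extension: str) -> str:
    """Download url into a temp file ending in `extension`; the caller deletes it."""
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # surface 4xx/5xx instead of writing an error page to disk
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=extension)
    with open(temp_file.name, "wb") as f:
        f.write(response.content)
    return temp_file.name


# Hypothetical usage with the cleanup the commit defers:
# temp_path = download_to_temp(file_url, headers, ".odt")
# try:
#     full_text = splitter_utils.load_odt_file(temp_path)
# finally:
#     os.remove(temp_path)  # remove the temp file once the text is extracted
```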
_utils/langchain_utils/Splitter_class.py
CHANGED
```diff
@@ -1,6 +1,7 @@
 from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
 from _utils.handle_files import return_document_list_with_llama_parser
 from _utils.langchain_utils.splitter_util import (
+    Splitter_Simple,
     SplitterUtils,
     combine_documents_without_losing_pagination,
 )
@@ -9,6 +10,7 @@ from setup.easy_imports import (
     RecursiveCharacterTextSplitter,
     Document,
     Docx2txtLoader,
+    TextLoader,
 )
 from typing import Any, List, Dict, Tuple, Optional, cast
 from _utils.models.gerar_relatorio import (
@@ -16,8 +18,6 @@ from _utils.models.gerar_relatorio import (
 )
 import uuid
 
-splitter_utils = SplitterUtils()
-
 
 class Splitter:
     def __init__(
@@ -25,6 +25,7 @@ class Splitter:
         chunk_size,
         chunk_overlap,
     ):
+        self.splitter_util = SplitterUtils()
         self.splitter_simple = Splitter_Simple(chunk_size, chunk_overlap)
         self.text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=chunk_size, chunk_overlap=chunk_overlap
@@ -56,11 +57,6 @@ class Splitter:
                     pages
                 )
             )
-            # for page in pages:
-            #     full_text_as_string = full_text_as_string + page.page_content
-            # chunks_of_string_only = chunks_of_string_only + self.text_splitter.split_text(
-            #     combined_text
-            # )
         else:
             if should_use_llama_parse:
                 print("\nENVIANDO PDFS PARA LLAMA PARSE")
@@ -73,10 +69,15 @@ class Splitter:
                 )
             else:
                 print("\nCOMEÇANDO LEITURA DO PDF")
-                file_extension = splitter_utils.get_file_type(pdf_path)
+                file_extension = self.splitter_util.get_file_type(pdf_path)
                 print("file_extension: ", file_extension)
                 if file_extension == "pdf":
                     pages = PyPDFLoader(pdf_path).load()
+                elif file_extension == "odt":
+                    full_text = self.splitter_util.load_odt_file(pdf_path)
+                    pages = self.splitter_simple.load_and_split_text(full_text)
+                elif file_extension == "txt":
+                    pages = TextLoader(pdf_path).load()
                 else:
                     pages = Docx2txtLoader(pdf_path).load()
                 print("TERMINOU LEITURA DO PDF")
@@ -177,40 +178,3 @@ class Splitter:
             char_count += len(text)
 
         return chunks
-
-
-class Splitter_Simple:
-    def __init__(
-        self,
-        chunk_size=1000,
-        chunk_overlap=400,
-    ):
-        self.text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=chunk_size, chunk_overlap=chunk_overlap
-        )
-
-    async def load_and_split_document(self, pdf_path: str):
-        """Load PDF and split into chunks with metadata"""
-        print("\nCOMEÇANDO LEITURA DO PDF")
-        pages = PyPDFLoader(pdf_path).load_and_split(self.text_splitter)
-        print("\nTERMINADO LEITURA DO PDF")
-
-        return pages
-
-    def load_and_split_text(self, text: str) -> List[Document]:
-        documents: List[Document] = []
-        chunks = self.text_splitter.split_text(text)
-
-        for chunk in chunks:
-            documents.append(Document(page_content=chunk))
-
-        return documents
-
-    def get_chunks_of_string_only_from_list_of_documents(
-        self, lista_de_documentos: List[Document]
-    ):
-        full_text_as_string = ""
-        for page in lista_de_documentos:
-            full_text_as_string = full_text_as_string + page.page_content
-        full_text_as_array = self.text_splitter.split_text(full_text_as_string)
-        return full_text_as_array
```
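Note: the new `.odt` branch relies on `Splitter_Simple.load_and_split_text` (moved into splitter_util.py by this commit, see the next file) to turn the extracted string into overlapping `Document` chunks. A small self-contained sketch of that behavior; the `langchain_text_splitters` import path is an assumption here, since the repo routes these imports through `setup.easy_imports`:

```python
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Same defaults as Splitter_Simple: 1000-char chunks with 400 chars of overlap
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=400)

full_text = "word " * 2000  # stand-in for the text extracted from an .odt file
pages = [Document(page_content=chunk) for chunk in splitter.split_text(full_text)]
print(len(pages), repr(pages[0].page_content[:40]))
```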
_utils/langchain_utils/splitter_util.py
CHANGED
```diff
@@ -1,6 +1,13 @@
 import os
 from typing import List, Tuple
 from langchain_core.documents import Document
+from odf.opendocument import load
+from odf.text import P
+from typing import List
+from setup.easy_imports import (
+    PyPDFLoader,
+    RecursiveCharacterTextSplitter,
+)
 
 
 class SplitterUtils:
@@ -11,10 +18,57 @@ class SplitterUtils:
             return "pdf"
         elif ext == ".docx":
             return "word"
+        elif ext == ".odt":
+            return "odt"
+        elif ext == ".txt":
+            return "txt"
         else:
             print("\next", ext)
             return "unknown"
 
+    def load_odt_file(self, file_path: str):
+        textdoc = load(file_path)
+        all_paragraphs = textdoc.getElementsByType(P)
+        text = "\n".join([p.firstChild.data for p in all_paragraphs if p.firstChild])
+        return text
+
+
+class Splitter_Simple:
+    def __init__(
+        self,
+        chunk_size=1000,
+        chunk_overlap=400,
+    ):
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size, chunk_overlap=chunk_overlap
+        )
+
+    async def load_and_split_document(self, pdf_path: str):
+        """Load PDF and split into chunks with metadata"""
+        print("\nCOMEÇANDO LEITURA DO PDF")
+        pages = PyPDFLoader(pdf_path).load_and_split(self.text_splitter)
+        print("\nTERMINADO LEITURA DO PDF")
+
+        return pages
+
+    def load_and_split_text(self, text: str) -> List[Document]:
+        documents: List[Document] = []
+        chunks = self.text_splitter.split_text(text)
+
+        for chunk in chunks:
+            documents.append(Document(page_content=chunk))
+
+        return documents
+
+    def get_chunks_of_string_only_from_list_of_documents(
+        self, lista_de_documentos: List[Document]
+    ):
+        full_text_as_string = ""
+        for page in lista_de_documentos:
+            full_text_as_string = full_text_as_string + page.page_content
+        full_text_as_array = self.text_splitter.split_text(full_text_as_string)
+        return full_text_as_array
+
 
 def combine_documents_without_losing_pagination(documents: list[Document]):
     combined_text = ""
```
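Note: one caveat on the new `load_odt_file`: `p.firstChild.data` reads only the paragraph's first node, so text nested inside styled runs (spans, links) is dropped, and a `firstChild` that is an element rather than a text node has no `.data` attribute and may raise. odfpy ships `odf.teletype.extractText`, which walks the whole subtree; a hedged alternative sketch:

```python
from odf.opendocument import load
from odf.teletype import extractText
from odf.text import P


def load_odt_file(file_path: str) -> str:
    """Extract all paragraph text from an .odt file, including styled runs."""
    textdoc = load(file_path)
    paragraphs = textdoc.getElementsByType(P)
    # extractText recurses through spans and links instead of reading
    # only the first child node of each paragraph
    return "\n".join(extractText(p) for p in paragraphs)
```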
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
setup/easy_imports.py
CHANGED
```diff
@@ -13,7 +13,7 @@ from langchain_huggingface import HuggingFaceEmbeddings
 from langchain.prompts import PromptTemplate
 from langchain_core.runnables import RunnablePassthrough
 from langchain_core.prompts import ChatPromptTemplate
-from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
+from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
 from langchain_community.vectorstores import Chroma
 from langchain_google_genai import ChatGoogleGenerativeAI
 
```
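Note on the re-exported `TextLoader`: called with only a path, it opens the file with the platform's default encoding and fails on bytes it cannot decode, which is easy to hit with user-uploaded .txt files from Bubble. The loader accepts `encoding` and `autodetect_encoding` parameters; a hedged sketch:

```python
from langchain_community.document_loaders import TextLoader

# Explicit encoding avoids platform-dependent defaults...
pages = TextLoader("arquivo.txt", encoding="utf-8").load()

# ...or let the loader try to detect the encoding when the source is unknown
# (autodetection needs the chardet package installed)
pages = TextLoader("arquivo.txt", autodetect_encoding=True).load()
```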