Spaces:
Running
Running
luanpoppe
committed on
Commit
·
6e09bf4
1
Parent(s):
d8410b4
feat: adicionando suporte a arquivos do word
Browse files
- _utils/bubble_integrations/obter_arquivo.py +21 -2
- _utils/handle_files.py +4 -1
- _utils/splitters/Splitter_class.py +20 -4
- _utils/splitters/splitter_util.py +14 -0
- requirements.txt +0 -0
- setup/easy_imports.py +1 -1
_utils/bubble_integrations/obter_arquivo.py
CHANGED
@@ -1,11 +1,13 @@
|
|
1 |
# from setup.easy_imports import PyPDFLoader
|
2 |
import os
|
3 |
-
from langchain_community.document_loaders import PyPDFLoader
|
4 |
import tempfile
|
5 |
import requests
|
6 |
|
7 |
from _utils.handle_files import return_document_list_with_llama_parser
|
|
|
8 |
|
|
|
9 |
# Authorization header built from the BUBBLE_TOKEN environment variable.
# Inner quotes are single so the f-string also parses on Python < 3.12 (PEP 701).
headers = {"Authorization": f"Bearer {os.environ.get('BUBBLE_TOKEN')}"}
|
10 |
|
11 |
|
@@ -32,6 +34,23 @@ async def get_pdf_from_bubble(
|
|
32 |
tmp_file.name
|
33 |
) # por enquanto este arquivo não está sendo excluído
|
34 |
else:
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
return result.load()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# from setup.easy_imports import PyPDFLoader
|
2 |
import os
|
3 |
+
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
|
4 |
import tempfile
|
5 |
import requests
|
6 |
|
7 |
from _utils.handle_files import return_document_list_with_llama_parser
|
8 |
+
from _utils.splitters.splitter_util import SplitterUtils
|
9 |
|
10 |
+
splitter_utils = SplitterUtils()
|
11 |
# Authorization header built from the BUBBLE_TOKEN environment variable.
# Inner quotes are single so the f-string also parses on Python < 3.12 (PEP 701).
headers = {"Authorization": f"Bearer {os.environ.get('BUBBLE_TOKEN')}"}
|
12 |
|
13 |
|
|
|
34 |
tmp_file.name
|
35 |
) # por enquanto este arquivo não está sendo excluído
|
36 |
else:
|
37 |
+
extension = file_url.split(".")[-1]
|
38 |
+
if extension.lower() == "pdf":
|
39 |
+
result = PyPDFLoader(file_url, headers=headers)
|
40 |
+
else:
|
41 |
+
temp_path = download_docx(file_url, headers)
|
42 |
+
result = Docx2txtLoader(temp_path)
|
43 |
|
44 |
return result.load()
|
45 |
+
|
46 |
+
|
47 |
+
def download_docx(url, headers):
    """Download a document and save it to a temporary ``.docx`` file.

    Args:
        url: Address the document is fetched from.
        headers: HTTP headers sent with the GET request.

    Returns:
        The filesystem path of the temporary file holding the response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx or 5xx status.
    """
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raise an exception for bad responses (status codes 4xx or 5xx)

    # Write through the handle NamedTemporaryFile already opened instead of
    # reopening the path: the handle is closed on exit from the block (no
    # descriptor leak) and the write also works on platforms that refuse to
    # open a file twice.
    # delete=False keeps the file on disk after closing; for now nothing
    # deletes it afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as temp_file:
        temp_file.write(response.content)

    return temp_file.name
|
_utils/handle_files.py
CHANGED
@@ -4,6 +4,8 @@ from langchain_core.documents import Document as LangchainDocument
|
|
4 |
from llama_index import Document
|
5 |
from llama_parse import LlamaParse, ResultType
|
6 |
|
|
|
|
|
7 |
llama_parser_keys = [
|
8 |
os.getenv("LLAMA_CLOUD_API_KEY_POPS"),
|
9 |
os.getenv("LLAMA_CLOUD_API_KEY_PEIXE"),
|
@@ -13,9 +15,10 @@ llama_parser_keys = [
|
|
13 |
def handle_pdf_files_from_serializer(files):
|
14 |
listaPDFs = []
|
15 |
for file in files:
|
|
|
16 |
file.seek(0)
|
17 |
with tempfile.NamedTemporaryFile(
|
18 |
-
delete=False, suffix=".
|
19 |
) as temp_file: # Create a temporary file to save the uploaded PDF
|
20 |
for (
|
21 |
chunk
|
|
|
4 |
from llama_index import Document
|
5 |
from llama_parse import LlamaParse, ResultType
|
6 |
|
7 |
+
from _utils.splitters.splitter_util import SplitterUtils
|
8 |
+
|
9 |
llama_parser_keys = [
|
10 |
os.getenv("LLAMA_CLOUD_API_KEY_POPS"),
|
11 |
os.getenv("LLAMA_CLOUD_API_KEY_PEIXE"),
|
|
|
15 |
def handle_pdf_files_from_serializer(files):
|
16 |
listaPDFs = []
|
17 |
for file in files:
|
18 |
+
file_extension = file.name.split(".")[-1]
|
19 |
file.seek(0)
|
20 |
with tempfile.NamedTemporaryFile(
|
21 |
+
delete=False, suffix=f".{file_extension}"
|
22 |
) as temp_file: # Create a temporary file to save the uploaded PDF
|
23 |
for (
|
24 |
chunk
|
_utils/splitters/Splitter_class.py
CHANGED
@@ -1,13 +1,23 @@
|
|
1 |
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
|
2 |
from _utils.handle_files import return_document_list_with_llama_parser
|
3 |
-
from _utils.splitters.splitter_util import
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
from typing import Any, List, Dict, Tuple, Optional, cast
|
6 |
from _utils.models.gerar_relatorio import (
|
7 |
DocumentChunk,
|
8 |
)
|
9 |
import uuid
|
10 |
|
|
|
|
|
11 |
|
12 |
class Splitter:
|
13 |
def __init__(
|
@@ -34,7 +44,7 @@ class Splitter:
|
|
34 |
|
35 |
if isBubble:
|
36 |
print("\nPEGANDO PDF DO BUBBLE")
|
37 |
-
pages = await get_pdf_from_bubble(pdf_path, should_use_llama_parse)
|
38 |
page_boundaries, combined_text = (
|
39 |
combine_documents_without_losing_pagination(pages)
|
40 |
)
|
@@ -53,8 +63,14 @@ class Splitter:
|
|
53 |
)
|
54 |
else:
|
55 |
print("\nCOMEÇANDO LEITURA DO PDF")
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
57 |
print("TERMINOU LEITURA DO PDF")
|
|
|
58 |
page_boundaries, combined_text = (
|
59 |
combine_documents_without_losing_pagination(pages)
|
60 |
)
|
|
|
1 |
from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
|
2 |
from _utils.handle_files import return_document_list_with_llama_parser
|
3 |
+
from _utils.splitters.splitter_util import (
|
4 |
+
SplitterUtils,
|
5 |
+
combine_documents_without_losing_pagination,
|
6 |
+
)
|
7 |
+
from setup.easy_imports import (
|
8 |
+
PyPDFLoader,
|
9 |
+
RecursiveCharacterTextSplitter,
|
10 |
+
Document,
|
11 |
+
Docx2txtLoader,
|
12 |
+
)
|
13 |
from typing import Any, List, Dict, Tuple, Optional, cast
|
14 |
from _utils.models.gerar_relatorio import (
|
15 |
DocumentChunk,
|
16 |
)
|
17 |
import uuid
|
18 |
|
19 |
+
splitter_utils = SplitterUtils()
|
20 |
+
|
21 |
|
22 |
class Splitter:
|
23 |
def __init__(
|
|
|
44 |
|
45 |
if isBubble:
|
46 |
print("\nPEGANDO PDF DO BUBBLE")
|
47 |
+
pages = await get_pdf_from_bubble(pdf_path, should_use_llama_parse) # type: ignore
|
48 |
page_boundaries, combined_text = (
|
49 |
combine_documents_without_losing_pagination(pages)
|
50 |
)
|
|
|
63 |
)
|
64 |
else:
|
65 |
print("\nCOMEÇANDO LEITURA DO PDF")
|
66 |
+
file_extension = splitter_utils.get_file_type(pdf_path)
|
67 |
+
print("file_extension: ", file_extension)
|
68 |
+
if file_extension == "pdf":
|
69 |
+
pages = PyPDFLoader(pdf_path).load()
|
70 |
+
else:
|
71 |
+
pages = Docx2txtLoader(pdf_path).load()
|
72 |
print("TERMINOU LEITURA DO PDF")
|
73 |
+
print("pages: ", pages)
|
74 |
page_boundaries, combined_text = (
|
75 |
combine_documents_without_losing_pagination(pages)
|
76 |
)
|
_utils/splitters/splitter_util.py
CHANGED
@@ -1,7 +1,21 @@
|
|
|
|
1 |
from typing import List, Tuple
|
2 |
from langchain_core.documents import Document
|
3 |
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
def combine_documents_without_losing_pagination(documents: list[Document]):
|
6 |
combined_text = ""
|
7 |
page_boundaries: List[Tuple[int, int, int]] = (
|
|
|
1 |
+
import os
|
2 |
from typing import List, Tuple
|
3 |
from langchain_core.documents import Document
|
4 |
|
5 |
|
6 |
+
class SplitterUtils:
    """File-related helper methods."""

    def get_file_type(self, file_path):
        """Classify a file by its extension, ignoring letter case.

        Returns:
            "pdf" for a ``.pdf`` extension, "word" for ``.docx`` and
            "unknown" for anything else. An unrecognized extension is
            printed before "unknown" is returned.
        """
        suffix = os.path.splitext(file_path)[1].lower()
        known_types = {".pdf": "pdf", ".docx": "word"}
        file_type = known_types.get(suffix)
        if file_type is not None:
            return file_type
        print("\next", suffix)
        return "unknown"
|
17 |
+
|
18 |
+
|
19 |
def combine_documents_without_losing_pagination(documents: list[Document]):
|
20 |
combined_text = ""
|
21 |
page_boundaries: List[Tuple[int, int, int]] = (
|
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
|
|
setup/easy_imports.py
CHANGED
@@ -12,7 +12,7 @@ from langchain_huggingface import HuggingFaceEmbeddings
|
|
12 |
# from langchain_community.embeddings import HuggingFaceEmbeddings
|
13 |
from langchain.prompts import PromptTemplate
|
14 |
from langchain_core.prompts import ChatPromptTemplate
|
15 |
-
from langchain_community.document_loaders import PyPDFLoader
|
16 |
from langchain_community.vectorstores import Chroma
|
17 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
18 |
|
|
|
12 |
# from langchain_community.embeddings import HuggingFaceEmbeddings
|
13 |
from langchain.prompts import PromptTemplate
|
14 |
from langchain_core.prompts import ChatPromptTemplate
|
15 |
+
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
|
16 |
from langchain_community.vectorstores import Chroma
|
17 |
from langchain_google_genai import ChatGoogleGenerativeAI
|
18 |
|