Spaces:
Running
Running
luanpoppe
committed on
Commit
·
668a7d5
1
Parent(s):
0472543
feat: adicionando suporte a arquivos .doc
Browse files
Dockerfile
CHANGED
@@ -8,6 +8,9 @@ COPY --chown=user . ./app
|
|
8 |
|
9 |
WORKDIR /app
|
10 |
|
|
|
|
|
|
|
11 |
RUN pip install --no-cache-dir -r requirements.txt
|
12 |
|
13 |
# RUN python3 -m venv /app/.venv
|
|
|
8 |
|
9 |
WORKDIR /app
|
10 |
|
11 |
+
# Instalação necessária para converter arquivos .doc
|
12 |
+
RUN apt-get update && apt-get install -y antiword
|
13 |
+
|
14 |
RUN pip install --no-cache-dir -r requirements.txt
|
15 |
|
16 |
# RUN python3 -m venv /app/.venv
|
_utils/bubble_integrations/obter_arquivo.py
CHANGED
@@ -8,6 +8,7 @@ from langchain_community.document_loaders import (
|
|
8 |
)
|
9 |
import tempfile
|
10 |
import requests
|
|
|
11 |
|
12 |
from _utils.handle_files import return_document_list_with_llama_parser
|
13 |
from _utils.langchain_utils.splitter_util import Splitter_Simple, SplitterUtils
|
@@ -53,6 +54,11 @@ async def get_pdf_from_bubble(
|
|
53 |
elif extension.lower() == "txt":
|
54 |
temp_path = download_file_from_bubble(file_url, headers, ".txt")
|
55 |
result = TextLoader(temp_path).load()
|
|
|
|
|
|
|
|
|
|
|
56 |
else:
|
57 |
temp_path = download_file_from_bubble(file_url, headers, ".docx")
|
58 |
result = Docx2txtLoader(temp_path).load()
|
|
|
8 |
)
|
9 |
import tempfile
|
10 |
import requests
|
11 |
+
import textract
|
12 |
|
13 |
from _utils.handle_files import return_document_list_with_llama_parser
|
14 |
from _utils.langchain_utils.splitter_util import Splitter_Simple, SplitterUtils
|
|
|
54 |
elif extension.lower() == "txt":
|
55 |
temp_path = download_file_from_bubble(file_url, headers, ".txt")
|
56 |
result = TextLoader(temp_path).load()
|
57 |
+
elif extension.lower() == "doc":
|
58 |
+
temp_path = download_file_from_bubble(file_url, headers, ".doc")
|
59 |
+
full_text_binary = textract.process(temp_path)
|
60 |
+
full_text = full_text_binary.decode("utf-8")
|
61 |
+
result = splitter_simple.load_and_split_text(full_text)
|
62 |
else:
|
63 |
temp_path = download_file_from_bubble(file_url, headers, ".docx")
|
64 |
result = Docx2txtLoader(temp_path).load()
|
_utils/langchain_utils/Splitter_class.py
CHANGED
@@ -18,6 +18,7 @@ from _utils.models.gerar_documento import (
|
|
18 |
DocumentChunk,
|
19 |
)
|
20 |
import uuid
|
|
|
21 |
|
22 |
|
23 |
class Splitter:
|
@@ -81,6 +82,10 @@ class Splitter:
|
|
81 |
pages = self.splitter_simple.load_and_split_text(full_text)
|
82 |
elif file_extension == "txt":
|
83 |
pages = TextLoader(pdf_path).load()
|
|
|
|
|
|
|
|
|
84 |
else:
|
85 |
pages = Docx2txtLoader(pdf_path).load()
|
86 |
print("TERMINOU LEITURA DO PDF")
|
|
|
18 |
DocumentChunk,
|
19 |
)
|
20 |
import uuid
|
21 |
+
import textract
|
22 |
|
23 |
|
24 |
class Splitter:
|
|
|
82 |
pages = self.splitter_simple.load_and_split_text(full_text)
|
83 |
elif file_extension == "txt":
|
84 |
pages = TextLoader(pdf_path).load()
|
85 |
+
elif file_extension == "doc":
|
86 |
+
full_text_binary = textract.process(pdf_path)
|
87 |
+
full_text = full_text_binary.decode("utf-8")
|
88 |
+
pages = self.splitter_simple.load_and_split_text(full_text)
|
89 |
else:
|
90 |
pages = Docx2txtLoader(pdf_path).load()
|
91 |
print("TERMINOU LEITURA DO PDF")
|
_utils/langchain_utils/splitter_util.py
CHANGED
@@ -22,6 +22,8 @@ class SplitterUtils:
|
|
22 |
return "odt"
|
23 |
elif ext == ".txt":
|
24 |
return "txt"
|
|
|
|
|
25 |
else:
|
26 |
print("\next", ext)
|
27 |
return "unknown"
|
|
|
22 |
return "odt"
|
23 |
elif ext == ".txt":
|
24 |
return "txt"
|
25 |
+
elif ext == ".doc":
|
26 |
+
return "doc"
|
27 |
else:
|
28 |
print("\next", ext)
|
29 |
return "unknown"
|
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
|
|