Spaces:

luanpoppe
/

vella-backend-tests

Running

luanpoppe committed on Apr 25

Commit

668a7d5

1 Parent(s): 0472543

feat: adicionando suporte a arquivos .doc

Files changed (5) hide show

Dockerfile CHANGED Viewed

@@ -8,6 +8,9 @@ COPY --chown=user . ./app
 WORKDIR /app
 RUN pip install --no-cache-dir -r requirements.txt
 # RUN python3 -m venv /app/.venv

 WORKDIR /app
+# Instalação necessária para converter arquivos .doc
+RUN apt-get update && apt-get install -y antiword
 RUN pip install --no-cache-dir -r requirements.txt
 # RUN python3 -m venv /app/.venv

_utils/bubble_integrations/obter_arquivo.py CHANGED Viewed

@@ -8,6 +8,7 @@ from langchain_community.document_loaders import (
 )
 import tempfile
 import requests
 from _utils.handle_files import return_document_list_with_llama_parser
 from _utils.langchain_utils.splitter_util import Splitter_Simple, SplitterUtils
@@ -53,6 +54,11 @@ async def get_pdf_from_bubble(
         elif extension.lower() == "txt":
             temp_path = download_file_from_bubble(file_url, headers, ".txt")
             result = TextLoader(temp_path).load()
         else:
             temp_path = download_file_from_bubble(file_url, headers, ".docx")
             result = Docx2txtLoader(temp_path).load()

 )
 import tempfile
 import requests
+import textract
 from _utils.handle_files import return_document_list_with_llama_parser
 from _utils.langchain_utils.splitter_util import Splitter_Simple, SplitterUtils
         elif extension.lower() == "txt":
             temp_path = download_file_from_bubble(file_url, headers, ".txt")
             result = TextLoader(temp_path).load()
+        elif extension.lower() == "doc":
+            temp_path = download_file_from_bubble(file_url, headers, ".doc")
+            full_text_binary = textract.process(temp_path)
+            full_text = full_text_binary.decode("utf-8")
+            result = splitter_simple.load_and_split_text(full_text)
         else:
             temp_path = download_file_from_bubble(file_url, headers, ".docx")
             result = Docx2txtLoader(temp_path).load()

_utils/langchain_utils/Splitter_class.py CHANGED Viewed

@@ -18,6 +18,7 @@ from _utils.models.gerar_documento import (
     DocumentChunk,
 )
 import uuid
 class Splitter:
@@ -81,6 +82,10 @@ class Splitter:
                     pages = self.splitter_simple.load_and_split_text(full_text)
                 elif file_extension == "txt":
                     pages = TextLoader(pdf_path).load()
                 else:
                     pages = Docx2txtLoader(pdf_path).load()
                 print("TERMINOU LEITURA DO PDF")

     DocumentChunk,
 )
 import uuid
+import textract
 class Splitter:
                     pages = self.splitter_simple.load_and_split_text(full_text)
                 elif file_extension == "txt":
                     pages = TextLoader(pdf_path).load()
+                elif file_extension == "doc":
+                    full_text_binary = textract.process(pdf_path)
+                    full_text = full_text_binary.decode("utf-8")
+                    pages = self.splitter_simple.load_and_split_text(full_text)
                 else:
                     pages = Docx2txtLoader(pdf_path).load()
                 print("TERMINOU LEITURA DO PDF")

_utils/langchain_utils/splitter_util.py CHANGED Viewed

@@ -22,6 +22,8 @@ class SplitterUtils:
             return "odt"
         elif ext == ".txt":
             return "txt"
         else:
             print("\next", ext)
             return "unknown"

             return "odt"
         elif ext == ".txt":
             return "txt"
+        elif ext == ".doc":
+            return "doc"
         else:
             print("\next", ext)
             return "unknown"

requirements.txt CHANGED Viewed

Binary files a/requirements.txt and b/requirements.txt differ