luanpoppe committed on
Commit
668a7d5
·
1 Parent(s): 0472543

feat: adicionando suporte a arquivos .doc

Browse files
Dockerfile CHANGED
@@ -8,6 +8,9 @@ COPY --chown=user . ./app
8
 
9
  WORKDIR /app
10
 
 
 
 
11
  RUN pip install --no-cache-dir -r requirements.txt
12
 
13
  # RUN python3 -m venv /app/.venv
 
8
 
9
  WORKDIR /app
10
 
11
+ # Instalação necessária para converter arquivos .doc
12
+ RUN apt-get update && apt-get install -y antiword
13
+
14
  RUN pip install --no-cache-dir -r requirements.txt
15
 
16
  # RUN python3 -m venv /app/.venv
_utils/bubble_integrations/obter_arquivo.py CHANGED
@@ -8,6 +8,7 @@ from langchain_community.document_loaders import (
8
  )
9
  import tempfile
10
  import requests
 
11
 
12
  from _utils.handle_files import return_document_list_with_llama_parser
13
  from _utils.langchain_utils.splitter_util import Splitter_Simple, SplitterUtils
@@ -53,6 +54,11 @@ async def get_pdf_from_bubble(
53
  elif extension.lower() == "txt":
54
  temp_path = download_file_from_bubble(file_url, headers, ".txt")
55
  result = TextLoader(temp_path).load()
 
 
 
 
 
56
  else:
57
  temp_path = download_file_from_bubble(file_url, headers, ".docx")
58
  result = Docx2txtLoader(temp_path).load()
 
8
  )
9
  import tempfile
10
  import requests
11
+ import textract
12
 
13
  from _utils.handle_files import return_document_list_with_llama_parser
14
  from _utils.langchain_utils.splitter_util import Splitter_Simple, SplitterUtils
 
54
  elif extension.lower() == "txt":
55
  temp_path = download_file_from_bubble(file_url, headers, ".txt")
56
  result = TextLoader(temp_path).load()
57
+ elif extension.lower() == "doc":
58
+ temp_path = download_file_from_bubble(file_url, headers, ".doc")
59
+ full_text_binary = textract.process(temp_path)
60
+ full_text = full_text_binary.decode("utf-8")
61
+ result = splitter_simple.load_and_split_text(full_text)
62
  else:
63
  temp_path = download_file_from_bubble(file_url, headers, ".docx")
64
  result = Docx2txtLoader(temp_path).load()
_utils/langchain_utils/Splitter_class.py CHANGED
@@ -18,6 +18,7 @@ from _utils.models.gerar_documento import (
18
  DocumentChunk,
19
  )
20
  import uuid
 
21
 
22
 
23
  class Splitter:
@@ -81,6 +82,10 @@ class Splitter:
81
  pages = self.splitter_simple.load_and_split_text(full_text)
82
  elif file_extension == "txt":
83
  pages = TextLoader(pdf_path).load()
 
 
 
 
84
  else:
85
  pages = Docx2txtLoader(pdf_path).load()
86
  print("TERMINOU LEITURA DO PDF")
 
18
  DocumentChunk,
19
  )
20
  import uuid
21
+ import textract
22
 
23
 
24
  class Splitter:
 
82
  pages = self.splitter_simple.load_and_split_text(full_text)
83
  elif file_extension == "txt":
84
  pages = TextLoader(pdf_path).load()
85
+ elif file_extension == "doc":
86
+ full_text_binary = textract.process(pdf_path)
87
+ full_text = full_text_binary.decode("utf-8")
88
+ pages = self.splitter_simple.load_and_split_text(full_text)
89
  else:
90
  pages = Docx2txtLoader(pdf_path).load()
91
  print("TERMINOU LEITURA DO PDF")
_utils/langchain_utils/splitter_util.py CHANGED
@@ -22,6 +22,8 @@ class SplitterUtils:
22
  return "odt"
23
  elif ext == ".txt":
24
  return "txt"
 
 
25
  else:
26
  print("\next", ext)
27
  return "unknown"
 
22
  return "odt"
23
  elif ext == ".txt":
24
  return "txt"
25
+ elif ext == ".doc":
26
+ return "doc"
27
  else:
28
  print("\next", ext)
29
  return "unknown"
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ