Spaces:
Running
Running
File size: 3,217 Bytes
cb23311 ab79998 55f46c1 23087eb 55f46c1 5cb00b6 55f46c1 6e09bf4 5cb00b6 cb23311 23087eb 55f46c1 cb23311 55f46c1 cb23311 55f46c1 6e09bf4 ab79998 5cb00b6 668a7d5 3462a1d 4ef8d92 668a7d5 6e09bf4 5cb00b6 55f46c1 5cb00b6 6e09bf4 5cb00b6 6e09bf4 5cb00b6 6e09bf4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
# from setup.easy_imports import PyPDFLoader
import os
from langchain_community.document_loaders import (
PyPDFLoader,
Docx2txtLoader,
TextLoader,
PyMuPDFLoader,
)
import tempfile
import requests
from _utils.handle_files import return_document_list_with_llama_parser
from _utils.langchain_utils.splitter_util import Splitter_Simple, SplitterUtils
# Module-level helper instances reused by the document-loading functions in this file.
splitter_utils = SplitterUtils()
splitter_simple = Splitter_Simple()

# Authorization header sent with every request to Bubble.
# The inner quotes are single quotes on purpose: reusing the outer quote
# character inside an f-string replacement field is only valid on Python 3.12+.
# NOTE: when BUBBLE_TOKEN is not set, os.environ.get returns None and the
# header value becomes the literal string "Bearer None".
headers = {"Authorization": f"Bearer {os.environ.get('BUBBLE_TOKEN')}"}
# def obter_arquivo(id_arquivo="1735864318176x375804955201372160"):
# return requests.get(
# f"https://vella.app.br/version-test/api/1.1/obj/formresponseanswer/{id_arquivo}",
# headers=headers,
# )
async def get_pdf_from_bubble(
    file_url="https://vella.app.br/version-test/fileupload/f1735864316650x718601440484441900/Boleto_DIGITICS%20Servic%CC%A7os%20de%20Secretariado%20LTDA_30_12_2024_804841714.pdf",
    should_use_llama_parse=False,
):
    """Load a document hosted on Bubble and return its parsed contents.

    Despite the name, several formats are handled. The loader is chosen from
    the file extension found in the URL path: ``pdf``, ``odt``, ``txt`` and
    ``doc`` have dedicated branches; anything else is treated as ``docx``.

    Args:
        file_url: URL of the file to load. Requests are sent with the
            module-level ``headers``.
        should_use_llama_parse: When true, the file is downloaded to a
            temporary ``.pdf`` file (regardless of its real extension) and
            handed to ``return_document_list_with_llama_parser``.

    Returns:
        Whatever the selected loader produces: the awaited result of
        ``return_document_list_with_llama_parser``, the ``.load()`` result of
        a LangChain loader, or the result of
        ``splitter_simple.load_and_split_text``. Presumably a list of
        document objects in every case -- confirm against those helpers.

    Raises:
        requests.HTTPError: If a download performed through
            ``download_file_from_bubble`` gets a 4xx/5xx response.

    NOTE: the downloads and loaders used here are synchronous, so they run
    to completion before control returns to the event loop.
    """
    # Local import so this function does not depend on the file's import block.
    from urllib.parse import urlparse

    if should_use_llama_parse:
        # The helper closes the temporary file before returning, so the
        # parser always sees the complete content on disk.
        temp_path = download_file_from_bubble(file_url, headers, ".pdf")
        # TODO: for now this temporary file is not being deleted.
        return await return_document_list_with_llama_parser(temp_path)

    # Take the extension from the URL path only, so a query string or a
    # fragment (e.g. "file.pdf?token=abc") does not hide the real extension.
    extension = urlparse(file_url).path.rsplit(".", 1)[-1].lower()

    if extension == "pdf":
        try:
            return PyPDFLoader(file_url, headers=headers).load()
        except Exception:
            # Fall back to PyMuPDF when PyPDF cannot load the file.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
            # and SystemExit propagate.
            return PyMuPDFLoader(file_url, headers=headers).load()

    if extension == "odt":
        temp_path = download_file_from_bubble(file_url, headers, ".odt")
        full_text = splitter_utils.load_odt_file(temp_path)
        return splitter_simple.load_and_split_text(full_text)

    if extension == "txt":
        temp_path = download_file_from_bubble(file_url, headers, ".txt")
        return TextLoader(temp_path).load()

    if extension == "doc":
        temp_path = download_file_from_bubble(file_url, headers, ".doc")
        full_text = splitter_utils.getTextFromDotDoc(temp_path)
        return splitter_simple.load_and_split_text(full_text)

    # Any other extension is assumed to be a .docx document.
    temp_path = download_file_from_bubble(file_url, headers, ".docx")
    return Docx2txtLoader(temp_path).load()
def download_file_from_bubble(url, headers, extension: str) -> str:
    """Download *url* and save the response body to a new temporary file.

    Args:
        url: Address of the file to download.
        headers: HTTP headers sent with the GET request.
        extension: Suffix given to the temporary file name (e.g. ``".odt"``).

    Returns:
        The filesystem path of the temporary file holding the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx or 5xx status.
    """
    response = requests.get(url, headers=headers)
    # Raise an exception for bad responses (status codes 4xx or 5xx).
    response.raise_for_status()

    # Write through the temporary file's own handle and close it on exit, so
    # no descriptor is leaked and the content is flushed before the path is
    # returned. delete=False keeps the file on disk after it is closed.
    with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as temp_file:
        temp_file.write(response.content)

    # TODO: for now this file is not being deleted; removing it is left to the caller.
    return temp_file.name
|