Spaces:
Running
Running
File size: 2,752 Bytes
cb23311 5cb00b6 55f46c1 23087eb 55f46c1 5cb00b6 55f46c1 6e09bf4 5cb00b6 cb23311 23087eb 55f46c1 cb23311 55f46c1 cb23311 55f46c1 6e09bf4 5cb00b6 6e09bf4 5cb00b6 55f46c1 5cb00b6 6e09bf4 5cb00b6 6e09bf4 5cb00b6 6e09bf4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# from setup.easy_imports import PyPDFLoader
import os
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
import tempfile
import requests
from _utils.handle_files import return_document_list_with_llama_parser
from _utils.langchain_utils.splitter_util import Splitter_Simple, SplitterUtils
# Splitter helpers, instantiated once when the module is imported.
splitter_utils = SplitterUtils()
splitter_simple = Splitter_Simple()

# Authorization header built from the BUBBLE_TOKEN environment variable.
# Single quotes are used inside the f-string because reusing the enclosing
# quote character is only valid syntax on Python 3.12+.
# NOTE: when BUBBLE_TOKEN is unset, the header value is the literal
# string "Bearer None".
headers = {"Authorization": f"Bearer {os.environ.get('BUBBLE_TOKEN')}"}
# def obter_arquivo(id_arquivo="1735864318176x375804955201372160"):
# return requests.get(
# f"https://vella.app.br/version-test/api/1.1/obj/formresponseanswer/{id_arquivo}",
# headers=headers,
# )
async def get_pdf_from_bubble(
    file_url="https://vella.app.br/version-test/fileupload/f1735864316650x718601440484441900/Boleto_DIGITICS%20Servic%CC%A7os%20de%20Secretariado%20LTDA_30_12_2024_804841714.pdf",
    should_use_llama_parse=False,
):
    """Load a file hosted on Bubble using the loader that matches its type.

    Args:
        file_url: URL of the file to load. The loader is chosen from the file
            extension: ``pdf``, ``odt`` and ``txt`` have dedicated loaders;
            anything else is treated as ``.docx``.
        should_use_llama_parse: When true, the extension is ignored; the file
            is downloaded to a temporary ``.pdf`` file and handed to
            ``return_document_list_with_llama_parser``.

    Returns:
        Whatever the selected loader returns (presumably a list of documents
        -- confirm against the helper implementations).

    Raises:
        requests.HTTPError: If a download performed through
            ``download_file_from_bubble`` receives a 4xx/5xx response.
    """
    # Local import so this function brings its own dependency into scope.
    from urllib.parse import urlparse

    # NOTE: the download and loader calls below are synchronous, so they
    # block the event loop while they run.
    if should_use_llama_parse:
        # TODO: for now this temporary file is not deleted.
        temp_path = download_file_from_bubble(file_url, headers, ".pdf")
        return await return_document_list_with_llama_parser(temp_path)

    # Take the extension from the URL *path* so that a query string or
    # fragment (e.g. "file.pdf?token=a.b") does not corrupt the detection.
    extension = os.path.splitext(urlparse(file_url).path)[1].lstrip(".").lower()
    if not extension:
        # Keep the previous heuristic for URLs whose path has no suffix
        # (e.g. the file name is carried in a query parameter).
        extension = file_url.split(".")[-1].lower()

    if extension == "pdf":
        # PyPDFLoader is given the URL and the auth headers directly.
        result = PyPDFLoader(file_url, headers=headers).load()
    elif extension == "odt":
        temp_path = download_file_from_bubble(file_url, headers, ".odt")
        full_text = splitter_utils.load_odt_file(temp_path)
        result = splitter_simple.load_and_split_text(full_text)
    elif extension == "txt":
        temp_path = download_file_from_bubble(file_url, headers, ".txt")
        result = TextLoader(temp_path).load()
    else:
        # Any other (or missing) extension is treated as a Word document.
        temp_path = download_file_from_bubble(file_url, headers, ".docx")
        result = Docx2txtLoader(temp_path).load()
    return result
def download_file_from_bubble(url, headers, extension: str):
    """Download ``url`` into a temporary file and return that file's path.

    Args:
        url: Address of the file to download.
        headers: HTTP headers sent with the GET request.
        extension: Suffix given to the temporary file (e.g. ``".docx"``).

    Returns:
        The path of the temporary file holding the response body. The file
        is created with ``delete=False``, so it stays on disk until something
        removes it.

    Raises:
        requests.HTTPError: If the response status is 4xx or 5xx.
    """
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raise an exception for bad responses (4xx or 5xx)

    # Write through the temporary file's own handle and let the context
    # manager close it, rather than leaving that handle open and re-opening
    # the file by name.
    # TODO: for now this temporary file is not deleted.
    with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as temp_file:
        temp_file.write(response.content)
    return temp_file.name
|