# from setup.easy_imports import PyPDFLoader
import logging
import os
import tempfile
from urllib.parse import urlparse

import requests
from langchain_community.document_loaders import (
    Docx2txtLoader,
    PyMuPDFLoader,
    PyPDFLoader,
    TextLoader,
)

from _utils.handle_files import return_document_list_with_llama_parser
from _utils.langchain_utils.splitter_util import Splitter_Simple, SplitterUtils

logger = logging.getLogger(__name__)

splitter_utils = SplitterUtils()
splitter_simple = Splitter_Simple()

# Authorization header sent with every request in this module. The token is
# read once, at import time, from the BUBBLE_TOKEN environment variable.
# Single quotes inside the f-string keep this line valid before Python 3.12.
headers = {"Authorization": f"Bearer {os.environ.get('BUBBLE_TOKEN')}"}


# def obter_arquivo(id_arquivo="1735864318176x375804955201372160"):
#     return requests.get(
#         f"https://vella.app.br/version-test/api/1.1/obj/formresponseanswer/{id_arquivo}",
#         headers=headers,
#     )


def _get_file_extension(file_url: str) -> str:
    """Return the lower-cased extension of the last path segment of *file_url*.

    Only the path component of the URL is inspected, so a query string or
    fragment does not hide the real extension. Returns an empty string when
    the last path segment contains no dot.
    """
    filename = urlparse(file_url).path.rsplit("/", 1)[-1]
    _, dot, extension = filename.rpartition(".")
    return extension.lower() if dot else ""


async def get_pdf_from_bubble(
    file_url: str = "https://vella.app.br/version-test/fileupload/f1735864316650x718601440484441900/Boleto_DIGITICS%20Servic%CC%A7os%20de%20Secretariado%20LTDA_30_12_2024_804841714.pdf",
    should_use_llama_parse: bool = False,
):
    """Fetch the file at *file_url* and load it with a loader chosen by extension.

    Despite the name, several formats are handled. The extension of the URL
    path selects the loader:

    * ``pdf``  -> ``PyPDFLoader`` on the URL, falling back to ``PyMuPDFLoader``
    * ``odt``  -> ``SplitterUtils.load_odt_file`` + ``Splitter_Simple.load_and_split_text``
    * ``txt``  -> ``TextLoader``
    * ``doc``  -> ``SplitterUtils.getTextFromDotDoc`` + ``Splitter_Simple.load_and_split_text``
    * anything else is treated as DOCX and loaded with ``Docx2txtLoader``

    Args:
        file_url: URL of the file; requests carry the module-level ``headers``.
        should_use_llama_parse: When true, the extension is ignored; the file
            is downloaded to a ``.pdf`` temporary file and handed to
            ``return_document_list_with_llama_parser``.

    Returns:
        Whatever the selected loader / parser returns.

    Raises:
        requests.HTTPError: If a download answers with a 4xx/5xx status.

    Note:
        The HTTP download and the loaders are synchronous calls made from
        this coroutine.
    """
    if should_use_llama_parse:
        # The helper closes the temporary file before returning, so the parser
        # always sees the complete, flushed content.
        # NOTE: the temporary file is not deleted for now.
        temp_path = download_file_from_bubble(file_url, headers, ".pdf")
        return await return_document_list_with_llama_parser(temp_path)

    extension = _get_file_extension(file_url)

    if extension == "pdf":
        try:
            return PyPDFLoader(file_url, headers=headers).load()
        except Exception:
            # Best-effort fallback to the second PDF loader; record why the
            # first one failed instead of hiding it.
            logger.warning(
                "PyPDFLoader failed for %s; falling back to PyMuPDFLoader",
                file_url,
                exc_info=True,
            )
            return PyMuPDFLoader(file_url, headers=headers).load()

    if extension == "odt":
        temp_path = download_file_from_bubble(file_url, headers, ".odt")
        full_text = splitter_utils.load_odt_file(temp_path)
        return splitter_simple.load_and_split_text(full_text)

    if extension == "txt":
        temp_path = download_file_from_bubble(file_url, headers, ".txt")
        return TextLoader(temp_path).load()

    if extension == "doc":
        temp_path = download_file_from_bubble(file_url, headers, ".doc")
        # full_text_binary = textract.process(temp_path)
        full_text = splitter_utils.getTextFromDotDoc(temp_path)
        return splitter_simple.load_and_split_text(full_text)

    # Every other extension (including none) is treated as DOCX.
    temp_path = download_file_from_bubble(file_url, headers, ".docx")
    return Docx2txtLoader(temp_path).load()


def download_file_from_bubble(url, headers, extension: str) -> str:
    """Download *url* into a new temporary file and return that file's path.

    Args:
        url: Address of the file to download.
        headers: HTTP headers sent with the request.
        extension: Suffix for the temporary file name, including the leading
            dot (e.g. ``".pdf"``).

    Returns:
        Path of the temporary file holding the response body.

    Raises:
        requests.HTTPError: If the response status is 4xx or 5xx.
    """
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raise an exception for bad responses (4xx or 5xx)

    # Write through the temporary file's own handle and close it on exit, so
    # the content is on disk before the path is handed to the caller.
    # NOTE: the temporary file is not deleted for now (delete=False).
    with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as temp_file:
        temp_file.write(response.content)

    return temp_file.name