Spaces:

luanpoppe
/

vella-backend

Running

File size: 3,217 Bytes

cb23311
 
ab79998
 
 
 
 
 
55f46c1
23087eb
 
55f46c1
5cb00b6
55f46c1
6e09bf4
5cb00b6
cb23311
 
 
 
 
 
 
 
 
23087eb
55f46c1
cb23311
55f46c1
cb23311
55f46c1
 
 
 
 
 
 
 
 
 
 
 
6e09bf4
 
ab79998
 
 
 
5cb00b6
 
 
 
 
 
 
668a7d5
 
3462a1d
4ef8d92
668a7d5
6e09bf4
5cb00b6
 
55f46c1
5cb00b6
6e09bf4
 
5cb00b6
6e09bf4
 
 
 
5cb00b6
6e09bf4

# from setup.easy_imports import PyPDFLoader
import os
from langchain_community.document_loaders import (
    PyPDFLoader,
    Docx2txtLoader,
    TextLoader,
    PyMuPDFLoader,
)
import tempfile
import requests

from _utils.handle_files import return_document_list_with_llama_parser
from _utils.langchain_utils.splitter_util import Splitter_Simple, SplitterUtils

# Shared helper instances for splitting/extracting text from downloaded files.
splitter_utils = SplitterUtils()
splitter_simple = Splitter_Simple()
# Authorization header sent along with requests for Bubble-hosted files.
# Single quotes are used inside the f-string so the line also parses on Python
# versions older than 3.12 (reusing the outer quote type requires 3.12+).
# NOTE: if BUBBLE_TOKEN is not set, the header value becomes "Bearer None".
headers = {"Authorization": f"Bearer {os.environ.get('BUBBLE_TOKEN')}"}


# def obter_arquivo(id_arquivo="1735864318176x375804955201372160"):
#     return requests.get(
#         f"https://vella.app.br/version-test/api/1.1/obj/formresponseanswer/{id_arquivo}",
#         headers=headers,
#     )


async def get_pdf_from_bubble(
    file_url="https://vella.app.br/version-test/fileupload/f1735864316650x718601440484441900/Boleto_DIGITICS%20Servic%CC%A7os%20de%20Secretariado%20LTDA_30_12_2024_804841714.pdf",
    should_use_llama_parse=False,
):
    """Fetch a file hosted on Bubble and load it as a list of documents.

    Despite the name, several formats are handled. The loader is picked from
    the extension found in the URL path: ``pdf``, ``odt``, ``txt`` and ``doc``
    have dedicated branches; anything else is treated as ``.docx``.

    Args:
        file_url: URL of the file to load.
        should_use_llama_parse: When true, the file is downloaded to a
            temporary ``.pdf`` file and handed to
            ``return_document_list_with_llama_parser`` regardless of its
            extension.

    Returns:
        Whatever the selected loader/parser produces (presumably a list of
        LangChain documents -- confirm against the helpers).

    Raises:
        requests.HTTPError: If a download performed through ``requests``
            returns a 4xx/5xx status.
    """
    if should_use_llama_parse:
        response = requests.get(file_url, headers=headers)
        response.raise_for_status()

        # Store the download in a temporary ".pdf" file. The ``with`` block is
        # left before parsing so the content is flushed and the handle closed;
        # otherwise the parser could read a partially written file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file.write(response.content)

        # TODO: for now this temporary file is not deleted after parsing.
        return await return_document_list_with_llama_parser(tmp_file.name)

    # Local import keeps this block self-contained.
    from urllib.parse import urlparse

    # Look at the URL *path* only, so a query string or fragment
    # (e.g. "file.pdf?token=...") does not hide the real extension.
    extension = urlparse(file_url).path.rsplit(".", 1)[-1].lower()

    if extension == "pdf":
        try:
            result = PyPDFLoader(file_url, headers=headers).load()
        except Exception:
            # PyPDFLoader failed on this file; fall back to PyMuPDFLoader.
            result = PyMuPDFLoader(file_url, headers=headers).load()
    elif extension == "odt":
        temp_path = download_file_from_bubble(file_url, headers, ".odt")
        full_text = splitter_utils.load_odt_file(temp_path)
        result = splitter_simple.load_and_split_text(full_text)
    elif extension == "txt":
        temp_path = download_file_from_bubble(file_url, headers, ".txt")
        result = TextLoader(temp_path).load()
    elif extension == "doc":
        temp_path = download_file_from_bubble(file_url, headers, ".doc")
        full_text = splitter_utils.getTextFromDotDoc(temp_path)
        result = splitter_simple.load_and_split_text(full_text)
    else:
        # Unknown extensions are handled as .docx.
        temp_path = download_file_from_bubble(file_url, headers, ".docx")
        result = Docx2txtLoader(temp_path).load()

    return result


def download_file_from_bubble(url, headers, extension: str) -> str:
    """Download ``url`` into a new temporary file and return the file's path.

    Args:
        url: Address of the file to download.
        headers: HTTP headers sent with the request.
        extension: Suffix given to the temporary file (e.g. ``".docx"``).

    Returns:
        Path of the temporary file holding the downloaded bytes.

    Raises:
        requests.HTTPError: If the response status is 4xx or 5xx.
    """
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raise an exception for bad responses (status codes 4xx or 5xx)

    # Write through the temporary file's own handle and close it on exit from
    # the ``with`` block, instead of leaving it open and reopening it by name.
    # delete=False keeps the file on disk after closing so callers can read it.
    # TODO: for now this temporary file is never deleted.
    with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as temp_file:
        temp_file.write(response.content)

    return temp_file.name