File size: 2,752 Bytes
cb23311
 
5cb00b6
55f46c1
23087eb
 
55f46c1
5cb00b6
55f46c1
6e09bf4
5cb00b6
cb23311
 
 
 
 
 
 
 
 
23087eb
55f46c1
cb23311
55f46c1
cb23311
55f46c1
 
 
 
 
 
 
 
 
 
 
 
6e09bf4
 
5cb00b6
 
 
 
 
 
 
 
6e09bf4
5cb00b6
 
55f46c1
5cb00b6
6e09bf4
 
5cb00b6
6e09bf4
 
 
 
5cb00b6
6e09bf4
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# from setup.easy_imports import PyPDFLoader
import os
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
import tempfile
import requests

from _utils.handle_files import return_document_list_with_llama_parser
from _utils.langchain_utils.splitter_util import Splitter_Simple, SplitterUtils

# Shared splitter helpers used by the loaders in this module.
splitter_utils = SplitterUtils()
splitter_simple = Splitter_Simple()
# Authorization header sent with the download requests made in this module.
# The token is read from the BUBBLE_TOKEN environment variable at import time.
# Single quotes are used inside the f-string because reusing the outer quote
# character is a SyntaxError on Python versions older than 3.12.
headers = {"Authorization": f"Bearer {os.environ.get('BUBBLE_TOKEN')}"}


# def obter_arquivo(id_arquivo="1735864318176x375804955201372160"):
#     return requests.get(
#         f"https://vella.app.br/version-test/api/1.1/obj/formresponseanswer/{id_arquivo}",
#         headers=headers,
#     )


async def get_pdf_from_bubble(
    file_url="https://vella.app.br/version-test/fileupload/f1735864316650x718601440484441900/Boleto_DIGITICS%20Servic%CC%A7os%20de%20Secretariado%20LTDA_30_12_2024_804841714.pdf",
    should_use_llama_parse=False,
):
    """Download a file from Bubble and load it into a list of documents.

    Args:
        file_url: URL of the file to load.
        should_use_llama_parse: When true, the file is downloaded with a
            ``.pdf`` suffix and handed to
            ``return_document_list_with_llama_parser``. Otherwise a loader is
            chosen from the URL's extension: ``pdf``, ``odt`` or ``txt``;
            anything else is treated as ``docx``.

    Returns:
        Whatever the selected parser/loader returns for the file.

    Raises:
        requests.HTTPError: If a download performed through
            ``download_file_from_bubble`` gets a 4xx/5xx response.

    NOTE: the downloads use the synchronous ``requests`` client, so they block
    the event loop while they run.
    """
    if should_use_llama_parse:
        # The helper closes the temp file before returning, so the parser
        # always sees the fully written content.
        temp_path = download_file_from_bubble(file_url, headers, ".pdf")
        try:
            return await return_document_list_with_llama_parser(temp_path)
        finally:
            # Assumes the parser has finished reading the file once the await
            # completes -- TODO confirm it does not return lazy documents.
            _remove_temp_file(temp_path)

    extension = _extension_from_url(file_url)
    if extension == "pdf":
        # PyPDFLoader is given the URL directly, so no local temp file is
        # created here.
        return PyPDFLoader(file_url, headers=headers).load()

    # Unknown extensions fall back to the docx loader, as before.
    suffix = f".{extension}" if extension in ("odt", "txt") else ".docx"
    temp_path = download_file_from_bubble(file_url, headers, suffix)
    try:
        if extension == "odt":
            full_text = splitter_utils.load_odt_file(temp_path)
            return splitter_simple.load_and_split_text(full_text)
        if extension == "txt":
            return TextLoader(temp_path).load()
        return Docx2txtLoader(temp_path).load()
    finally:
        # The temp file is only needed while loading; remove it so repeated
        # calls do not accumulate files in the temp directory.
        _remove_temp_file(temp_path)


def _extension_from_url(file_url):
    """Return the lowercase extension (no dot) that selects the loader."""
    from urllib.parse import urlparse

    # Prefer the extension of the URL path so that a query string or fragment
    # (e.g. "file.pdf?version=2") does not hide the real file type.
    path_extension = os.path.splitext(urlparse(file_url).path)[1]
    path_extension = path_extension.lstrip(".").lower()
    if path_extension in ("pdf", "odt", "txt"):
        return path_extension
    # Fall back to the text after the last dot of the whole URL, which is how
    # the extension was detected originally.
    return file_url.split(".")[-1].lower()


def _remove_temp_file(path):
    """Delete *path*, ignoring the case where it cannot be removed."""
    try:
        os.remove(path)
    except OSError:
        # Cleanup is best-effort; a leftover temp file must not mask the
        # result or the original exception.
        pass


def download_file_from_bubble(url, headers, extension: str) -> str:
    """Download *url* into a temporary file and return the file's path.

    Args:
        url: Address of the file to download.
        headers: HTTP headers sent with the request.
        extension: Suffix given to the temporary file (e.g. ``".docx"``).

    Returns:
        Path of the temporary file holding the downloaded content. The file
        is created with ``delete=False`` and is not removed here, so the
        caller is responsible for deleting it.

    Raises:
        requests.HTTPError: If the response status is 4xx or 5xx.
    """
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raise an exception for bad responses (status codes 4xx or 5xx)

    # Write through the temp file's own handle and close it on exit, instead
    # of leaving that handle open and opening the same path a second time.
    with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as temp_file:
        temp_file.write(response.content)

    return temp_file.name