Commit fa506ce by luanpoppe
Parents: 4f6fe00 01a4e83

Merge branch 'tests' of https://github.com/luanpoppe/vella-backend

.env.example CHANGED
@@ -11,4 +11,6 @@ LLAMA_CLOUD_API_KEY_PEIXE=""
 DEEPSEEKK_API_KEY=""
 GOOGLE_API_KEY_PEIXE=""
 SENTRY_DSN=""
-AMBIENTE="testes"
+AMBIENTE="testes"
+GOOGLE_APPLICATION_CREDENTIALS="" # Só é necessário em ambiente de desenvolvimento que não esteja usando docker
+GCP_CREDENTIALS_JSON_CONTENT="Conteúdo inteiro do arquivo vella_gcp_luan_credentials.json" # Em produção, tem que conter todo o conteúdo do arquivo de credentials. Localmente, não precisa existir
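The two comments above imply that, in production, the credentials JSON travels inside GCP_CREDENTIALS_JSON_CONTENT instead of a mounted file. This commit does not show how the variable is consumed (the entrypoint.sh copied in the Dockerfile below is not part of the diff), so the following is only a minimal, assumed sketch of materializing it for the Google SDKs; the helper name and target path are hypothetical.

import os


def materialize_gcp_credentials(path: str = "/tmp/gcp_credentials.json") -> None:
    # Hypothetical helper: writes GCP_CREDENTIALS_JSON_CONTENT to disk so that
    # GOOGLE_APPLICATION_CREDENTIALS can point at a real file in production.
    content = os.environ.get("GCP_CREDENTIALS_JSON_CONTENT", "")
    if not content:
        # Locally, GOOGLE_APPLICATION_CREDENTIALS may already point to a real file.
        return
    with open(path, "w", encoding="utf-8") as file:
        file.write(content)
    os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", path)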
.gitignore CHANGED
@@ -173,5 +173,4 @@ cython_debug/
 #.idea/
 
 # End of https://www.toptal.com/developers/gitignore/api/django
-
 vella_gcp_luan_credentials.json
Dockerfile CHANGED
@@ -3,6 +3,10 @@ FROM python:3.12
 # Instalação necessária para converter arquivos .doc
 RUN apt-get update && apt-get install -y antiword
 
+# Copy the entrypoint script and make it executable
+COPY entrypoint.sh /entrypoint.sh
+RUN chmod +x /entrypoint.sh
+
 RUN useradd -m -u 1000 user
 USER user
 ENV PATH="/home/user/.local/bin:$PATH"
@@ -23,6 +27,10 @@ RUN pip install --no-cache-dir -r requirements.txt
 RUN python manage.py collectstatic --noinput
 
 RUN pip install uvicorn
+
+# Set the entrypoint to our script
+ENTRYPOINT ["/entrypoint.sh"]
+
 CMD ["uvicorn", "setup.asgi:application", "--host", "0.0.0.0", "--port", "7860"]
 
 # ENTRYPOINT ["python", "manage.py", "runserver"]
_utils/Handle_Files_Class.py ADDED
@@ -0,0 +1,28 @@
+from dataclasses import dataclass
+from typing import List, Tuple
+
+from _utils.langchain_utils.Splitter_class import Splitter
+from _utils.models.gerar_documento import DocumentChunk
+
+
+@dataclass
+class HandleFilesClass:
+    async def get_full_text_and_all_PDFs_chunks(
+        self,
+        listaPDFs: List[str],
+        splitterObject: Splitter,
+        should_use_llama_parse: bool,
+        isBubble: bool,
+    ) -> Tuple[List[DocumentChunk], List[str]]:
+        all_PDFs_chunks: List[DocumentChunk] = []
+
+        pages: List[str] = []
+
+        # Load and process document
+        for pdf_path in listaPDFs:
+            chunks, pages = await splitterObject.load_and_split_document(
+                pdf_path, should_use_llama_parse, isBubble
+            )
+            all_PDFs_chunks = all_PDFs_chunks + chunks
+
+        return all_PDFs_chunks, pages
_utils/Utils_Class.py ADDED
@@ -0,0 +1,10 @@
+from dataclasses import dataclass
+from typing import List
+
+from _utils.Handle_Files_Class import HandleFilesClass
+
+
+@dataclass
+class UtilsClass:
+    lista_pdfs: List[str] | None = None
+    handle_files = HandleFilesClass()
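UtilsClass only wires up a shared HandleFilesClass instance; the call pattern below mirrors what GerarDocumento.get_text_and_pdf_chunks does further down in this diff. A minimal usage sketch (the Splitter argument and the two boolean flags are placeholders):

from _utils.Utils_Class import UtilsClass
from _utils.langchain_utils.Splitter_class import Splitter


async def chunk_pdfs(lista_pdfs: list[str], splitter: Splitter):
    # UtilsClass exposes a ready HandleFilesClass; GerarDocumento calls it the
    # same way when it loads and splits every PDF in the request.
    utils = UtilsClass()
    chunks, pages = await utils.handle_files.get_full_text_and_all_PDFs_chunks(
        lista_pdfs,
        splitter,
        should_use_llama_parse=False,  # illustrative flags
        isBubble=False,
    )
    return chunks, pages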
_utils/axiom_logs.py ADDED
@@ -0,0 +1,27 @@
+from dataclasses import dataclass
+
+from setup.logging import Axiom
+
+
+@dataclass
+class AxiomLogs:
+    axiom: Axiom
+
+    def texto_completo_pdf(self, full_text_as_array):
+        self.axiom.send_axiom(
+            f"INÍCIO DO TEXTO COMPLETO DOS PDFS: {full_text_as_array[0:5]}"
+        )
+
+    def resumo_inicial_processo(self, response_auxiliar_summary):
+        self.axiom.send_axiom(
+            f"RESUMO INICIAL DO PROCESSO: {response_auxiliar_summary}"
+        )
+
+    def inicio_requisicao_contextual(self):
+        self.axiom.send_axiom("COMEÇANDO A FAZER AS REQUISIÇÕES DO CONTEXTUAL")
+
+    def fim_requisicao_contextual(self):
+        self.axiom.send_axiom("TERMINOU DE FAZER TODAS AS REQUISIÇÕES DO CONTEXTUAL")
+
+    def chunks_inicialmente(self, chunks_processados):
+        self.axiom.send_axiom(f"CHUNKS PROCESSADOS INICIALMENTE: {chunks_processados}")
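AxiomLogs simply centralizes the send_axiom messages that previously lived inline in gerar_documento.py; a short usage sketch mirroring the calls made later in this diff:

from _utils.axiom_logs import AxiomLogs
from setup.logging import Axiom


def log_chunking_stage(axiom_instance: Axiom, full_text_as_array: list[str]) -> None:
    ax = AxiomLogs(axiom_instance)
    ax.texto_completo_pdf(full_text_as_array)  # logs the first pages of the PDFs
    ax.inicio_requisicao_contextual()          # marks the start of the contextual requests
    ax.fim_requisicao_contextual()             # marks the end of the contextual requests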
_utils/custom_exception_handler.py CHANGED
@@ -44,7 +44,7 @@ def custom_exception_handler(exc, context):
     return response
 
 
-def custom_exception_handler_wihout_api_handler(
+def custom_exception_handler_without_api_handler(
     error, serializer: Union[GerarDocumentoSerializerData, Any], axiom_instace: Axiom
 ):
     bahia_tz = pytz.timezone("America/Bahia")
_utils/files/ragas_testset.csv DELETED
@@ -1,18 +0,0 @@
-position,user_input,reference,response,retrieved_contexts
-0,What are the global implications of the USA Supreme Court ruling on abortion?,"The global implications of the USA Supreme Court ruling on abortion are significant. The ruling has led to limited or no access to abortion for one in three women and girls of reproductive age in states where abortion access is restricted. These states also have weaker maternal health support, higher maternal death rates, and higher child poverty rates. Additionally, the ruling has had an impact beyond national borders due to the USA's geopolitical and cultural influence globally. Organizations and activists worldwide are concerned that the ruling may inspire anti-abortion legislative and policy attacks in other countries. The ruling has also hindered progressive law reform and the implementation of abortion guidelines in certain African countries. Furthermore, the ruling has created a chilling effect in international policy spaces, empowering anti-abortion actors to undermine human rights protections.","The global implications of the USA Supreme Court ruling on abortion can be significant, as it sets a precedent for other countries and influences the global discourse on reproductive rights. Here are some potential implications:
-
-1. Influence on other countries: The Supreme Court's ruling can serve as a reference point for other countries grappling with their own abortion laws. It can provide legal arguments and reasoning that advocates for reproductive rights can use to challenge restrictive abortion laws in their respective jurisdictions.
-
-2. Strengthening of global reproductive rights movements: A favorable ruling by the Supreme Court can energize and empower reproductive rights movements worldwide. It can serve as a rallying point for activists and organizations advocating for women's rights, leading to increased mobilization and advocacy efforts globally.
-
-3. Counteracting anti-abortion movements: Conversely, a ruling that restricts abortion rights can embolden anti-abortion movements globally. It can provide legitimacy to their arguments and encourage similar restrictive measures in other countries, potentially leading to a rollback of existing reproductive rights.
-
-4. Impact on international aid and policies: The Supreme Court's ruling can influence international aid and policies related to reproductive health. It can shape the priorities and funding decisions of donor countries and organizations, potentially leading to increased support for reproductive rights initiatives or conversely, restrictions on funding for abortion-related services.
-
-5. Shaping international human rights standards: The ruling can contribute to the development of international human rights standards regarding reproductive rights. It can influence the interpretation and application of existing human rights treaties and conventions, potentially strengthening the recognition of reproductive rights as fundamental human rights globally.
-
-6. Global health implications: The Supreme Court's ruling can have implications for global health outcomes, particularly in countries with restrictive abortion laws. It can impact the availability and accessibility of safe and legal abortion services, potentially leading to an increase in unsafe abortions and related health complications.
-
-It is important to note that the specific implications will depend on the nature of the Supreme Court ruling and the subsequent actions taken by governments, activists, and organizations both within and outside the United States.","[""- In 2022, the USA Supreme Court handed down a decision ruling that overturned 50 years of jurisprudence recognizing a constitutional right to abortion.\n- This decision has had a massive impact: one in three women and girls of reproductive age now live in states where abortion access is either totally or near-totally inaccessible.\n- The states with the most restrictive abortion laws have the weakest maternal health support, higher maternal death rates, and higher child poverty rates.\n- The USA Supreme Court ruling has also had impacts beyond national borders due to the geopolitical and cultural influence wielded by the USA globally and the aid it funds.\n- SRR organizations and activists across the world have expressed fear about the ruling laying the groundwork for anti-abortion legislative and policy attacks in other countries.\n- Advocates have also observed the ruling's impact on progressive law reform and the stalling of the adoption and enforcement of abortion guidelines in certain African countries.\n- The ruling has created a chilling effect in international policy spaces, emboldening anti-abortion state and non-state actors to undermine human rights protections.""
-'The USA Supreme Court ruling on abortion has sparked intense debates and discussions not only within the country but also around the world. Many countries look to the United States as a leader in legal and social issues, so the decision could potentially influence the policies and attitudes towards abortion in other nations.'
-""The ruling may also impact international organizations and non-governmental groups that work on reproductive rights and women's health issues. Depending on the outcome, there could be shifts in funding, advocacy efforts, and collaborations with American counterparts, leading to ripple effects in the global fight for reproductive justice.""]"
_utils/gerar_documento.py CHANGED
@@ -1,31 +1,11 @@
 import os
-from langchain_core.messages import HumanMessage
-from typing import Any, Union, cast
-from _utils.langchain_utils.LLM_class import LLM
-from _utils.bubble_integrations.enviar_resposta_final import enviar_resposta_final
-from _utils.custom_exception_handler import custom_exception_handler_wihout_api_handler
-from _utils.gerar_documento_utils.prompts import (
-    prompt_gerar_query_dinamicamente,
-)
+from typing import Any, Union
+from _utils.custom_exception_handler import custom_exception_handler_without_api_handler
+from rest_framework.response import Response
 from _utils.gerar_documento_utils.GerarDocumento import (
     GerarDocumento,
 )
-from _utils.gerar_documento_utils.contextual_retriever import (
-    ContextualRetriever,
-)
-from _utils.gerar_documento_utils.utils import (
-    generate_document_title,
-    gerar_resposta_compilada,
-    get_full_text_and_all_PDFs_chunks,
-    get_response_from_auxiliar_contextual_prompt,
-)
-from _utils.models.gerar_documento import (
-    RetrievalConfig,
-)
-import markdown
 
-from _utils.langchain_utils.Prompt_class import Prompt
-from _utils.utils import convert_markdown_to_HTML
 from gerar_documento.serializer import (
     GerarDocumentoComPDFProprioSerializer,
     GerarDocumentoComPDFProprioSerializerData,
@@ -47,130 +27,39 @@ async def gerar_documento(
     listaPDFs,
     axiom_instance: Axiom,
     isBubble=False,
-):
+) -> Response | dict[str, Any]:
     try:
-        contextual_retriever = ContextualRetriever(serializer)
-
-        # Initialize enhanced summarizer
-        summarizer = GerarDocumento(serializer, axiom_instance)
-
-        all_PDFs_chunks, full_text_as_array = await get_full_text_and_all_PDFs_chunks(
-            listaPDFs,
-            summarizer.splitter,
-            serializer.should_use_llama_parse,
-            isBubble,
-        )
-        axiom_instance.send_axiom(
-            f"INÍCIO DO TEXTO COMPLETO DOS PDFS: {full_text_as_array[0:5]}"
-        )
-
-        is_contextualized_chunk = serializer.should_have_contextual_chunks
-
-        if is_contextualized_chunk:
-            response_auxiliar_summary = (
-                await get_response_from_auxiliar_contextual_prompt(full_text_as_array)
-            )
-            axiom_instance.send_axiom(
-                f"RESUMO INICIAL DO PROCESSO: {response_auxiliar_summary}"
-            )
-
-            axiom_instance.send_axiom("COMEÇANDO A FAZER AS REQUISIÇÕES DO CONTEXTUAL")
-            contextualized_chunks = await contextual_retriever.contextualize_all_chunks(
-                all_PDFs_chunks, response_auxiliar_summary, axiom_instance
-            )
-            axiom_instance.send_axiom(
-                "TERMINOU DE FAZER TODAS AS REQUISIÇÕES DO CONTEXTUAL"
-            )
-            chunks_processados = contextualized_chunks
-            axiom_instance.send_axiom(
-                f"CHUNKS PROCESSADOS INICIALMENTE: {chunks_processados}"
-            )
-        else:
-            chunks_processados = all_PDFs_chunks
-
-        llm = LLM()
-        prompt_para_gerar_query_dinamico = prompt_gerar_query_dinamicamente(
-            cast(str, response_auxiliar_summary)
-        )
-
-        axiom_instance.send_axiom(
-            "COMEÇANDO REQUISIÇÃO PARA GERAR O QUERY DINAMICAMENTE DO VECTOR STORE"
-        )
-        query_gerado_dinamicamente_para_o_vector_store = (
-            await llm.google_gemini_ainvoke(
-                prompt_para_gerar_query_dinamico, "gemini-2.0-flash"
-            )
-        )
-
-        axiom_instance.send_axiom(
-            f"query_gerado_dinamicamente_para_o_vector_store: {query_gerado_dinamicamente_para_o_vector_store.content}",
-        )
-
-        # Create enhanced vector store and BM25 index
-        vector_store, bm25, chunk_ids = (
-            summarizer.vector_store.create_enhanced_vector_store(
-                chunks_processados, is_contextualized_chunk, axiom_instance
-            )
-        )
-
-        llm_ultimas_requests = serializer.llm_ultimas_requests
-        axiom_instance.send_axiom("COMEÇANDO A FAZER ÚLTIMA REQUISIÇÃO")
-        structured_summaries = await summarizer.gerar_documento_final(
-            vector_store,
-            bm25,
-            chunk_ids,
-            llm_ultimas_requests,
-            cast(
-                str, query_gerado_dinamicamente_para_o_vector_store.content
-            ),  # prompt_auxiliar_SEM_CONTEXT,
-        )
-        axiom_instance.send_axiom("TERMINOU DE FAZER A ÚLTIMA REQUISIÇÃO")
-
-        if not isinstance(structured_summaries, list):
-            from rest_framework.response import Response
-
-            return Response({"erro": structured_summaries})
-
-        texto_completo = summarizer.resumo_gerado + "\n\n"
-
-        for x in structured_summaries:
-            texto_completo = texto_completo + x["content"] + "\n"
-            x["source"]["text"] = x["source"]["text"][0:200]
-            x["source"]["context"] = x["source"]["context"][0:200]
-
-        texto_completo_como_html = convert_markdown_to_HTML(texto_completo).replace(
-            "resposta_segunda_etapa:", "<br><br>"
-        )
-        axiom_instance.send_axiom(
-            f"texto_completo_como_html: {texto_completo_como_html}"
-        )
-
-        if is_contextualized_chunk:
-            prompt_titulo_do_documento = response_auxiliar_summary
-        else:
-            prompt_titulo_do_documento = texto_completo_como_html
-        titulo_do_documento = await generate_document_title(
-            cast(str, prompt_titulo_do_documento)
-        )
+        summarizer = GerarDocumento(serializer, isBubble, axiom_instance)
+        summarizer.lista_pdfs = listaPDFs
+
+        await summarizer.get_text_and_pdf_chunks()
+
+        await summarizer.get_response_from_auxiliar_contextual_prompt()
+
+        await summarizer.generate_chunks_processados()
+
+        await summarizer.generate_query_for_vector_store()
+
+        await summarizer.create_enhanced_vector_store()
+
+        structured_summaries = await summarizer.do_last_requests()
+
+        if not isinstance(structured_summaries, list):
+            return Response({"erro": structured_summaries})
+
+        await summarizer.generate_complete_text()
+
+        await summarizer.get_document_title()
 
         if isBubble:
-            axiom_instance.send_axiom("COMEÇANDO A REQUISIÇÃO FINAL PARA O BUBBLE")
-            enviar_resposta_final(
-                serializer.doc_id,  # type: ignore
-                serializer.form_response_id,  # type: ignore
-                serializer.version,  # type: ignore
-                texto_completo_como_html,
-                False,
-                cast(str, titulo_do_documento),
-            )
-            axiom_instance.send_axiom("TERMINOU A REQUISIÇÃO FINAL PARA O BUBBLE")
+            await summarizer.send_to_bubble()
 
         return {
-            "texto_completo": texto_completo_como_html,
-            "titulo_do_documento": titulo_do_documento,
+            "texto_completo": summarizer.texto_completo_como_html,
+            "titulo_do_documento": summarizer.titulo_do_documento,
             "resultado": structured_summaries,
-            "parametros-utilizados": gerar_resposta_compilada(serializer),
+            "parametros-utilizados": summarizer.gerar_resposta_compilada(),
         }
     except Exception as e:
-        custom_exception_handler_wihout_api_handler(e, serializer, axiom_instance)
+        custom_exception_handler_without_api_handler(e, serializer, axiom_instance)
         raise
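After this refactor, gerar_documento is a thin driver over the stateful GerarDocumento pipeline object. A hedged call sketch; the serializer and Axiom instance are built by the Django view layer, which is outside this diff, so the arguments below are placeholders:

from _utils.gerar_documento import gerar_documento


async def run_pipeline(serializer, axiom_instance, pdfs: list[str]):
    # serializer: GerarDocumentoSerializerData (assembled elsewhere)
    # axiom_instance: setup.logging.Axiom (assembled elsewhere)
    # pdfs: local paths or Bubble file references, depending on isBubble
    return await gerar_documento(serializer, pdfs, axiom_instance, isBubble=False)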
_utils/gerar_documento_utils/GerarDocumento.py CHANGED
@@ -1,12 +1,23 @@
 from dataclasses import dataclass
-import os
+from langchain_core.messages import HumanMessage
 from typing import Any, List, Dict, Literal, Tuple, Optional, Union, cast
 
 from pydantic import SecretStr
+from _utils.Utils_Class import UtilsClass
+from _utils.axiom_logs import AxiomLogs
+from _utils.bubble_integrations.enviar_resposta_final import enviar_resposta_final
+from _utils.gerar_documento_utils.contextual_retriever import ContextualRetriever
+from _utils.gerar_documento_utils.llm_calls import agemini_answer
+from _utils.gerar_documento_utils.prompts import (
+    create_prompt_auxiliar_do_contextual_prompt,
+    prompt_gerar_query_dinamicamente,
+    prompt_para_gerar_titulo,
+)
 from _utils.langchain_utils.Chain_class import Chain
-from _utils.langchain_utils.LLM_class import LLM
+from _utils.langchain_utils.LLM_class import LLM, Google_llms
 from _utils.langchain_utils.Prompt_class import Prompt
 from _utils.langchain_utils.Vector_store_class import VectorStore
+from _utils.utils import convert_markdown_to_HTML
 from gerar_documento.serializer import (
     GerarDocumentoComPDFProprioSerializerData,
     GerarDocumentoSerializerData,
@@ -21,13 +32,17 @@ from setup.easy_imports import (
 )
 import logging
 from _utils.models.gerar_documento import (
+    ContextualizedChunk,
+    DocumentChunk,
     RetrievalConfig,
 )
 from cohere import Client
 from _utils.langchain_utils.Splitter_class import Splitter
 import time
-
+from setup.tokens import openai_api_key, cohere_api_key
 from setup.logging import Axiom
+import tiktoken
+from setup.environment import default_model
 
 
 def reciprocal_rank_fusion(result_lists, weights=None):
@@ -51,6 +66,10 @@ def reciprocal_rank_fusion(result_lists, weights=None):
 
 @dataclass
 class GerarDocumentoUtils:
+    axiom_instance: Axiom
+    temperature = 0.0
+    model = default_model
+
     def criar_output_estruturado(self, summaries: List[str | Any], sources: Any):
         structured_output = []
         for idx, summary in enumerate(summaries):
@@ -81,35 +100,131 @@ class GerarDocumentoUtils:
         else:
             return documento_gerado
 
-
-class GerarDocumento:
-    openai_api_key = os.environ.get("OPENAI_API_KEY", "")
-    cohere_api_key = os.environ.get("COHERE_API_KEY", "")
-    resumo_gerado = ""
-    gerar_documento_utils = GerarDocumentoUtils()
-
-    def __init__(
+    def create_retrieval_config(
         self,
         serializer: Union[
            GerarDocumentoSerializerData, GerarDocumentoComPDFProprioSerializerData, Any
        ],
-        axiom_instance: Axiom,
    ):
-        self.config = RetrievalConfig(
+        return RetrievalConfig(
            num_chunks=serializer.num_chunks_retrieval,
            embedding_weight=serializer.embedding_weight,
            bm25_weight=serializer.bm25_weight,
            context_window=serializer.context_window,
            chunk_overlap=serializer.chunk_overlap,
        )
+
+    async def checar_se_resposta_vazia_do_documento_final(
+        self, llm_ultimas_requests: str, prompt: str
+    ):
+        llm = self.select_model_for_last_requests(llm_ultimas_requests)  # type: ignore
+        documento_gerado = ""
+        tentativas = 0
+
+        while tentativas < 5 and not documento_gerado:
+            tentativas += 1
+            try:
+                resposta = llm.invoke(prompt)
+                if hasattr(resposta, "content") and resposta.content.strip():  # type: ignore
+                    if isinstance(resposta.content, list):
+                        resposta.content = "\n".join(resposta.content)  # type: ignore
+
+                    documento_gerado = resposta.content.strip()  # type: ignore
+                else:
+                    print(f"Tentativa {tentativas}: resposta vazia ou inexistente.")
+            except Exception as e:
+                llm = self.select_model_for_last_requests("gemini-2.0-flash")
+                print(f"Tentativa {tentativas}: erro ao invocar o modelo: {e}")
+                time.sleep(5)
+
+        if not documento_gerado:
+            try:
+                self.axiom_instance.send_axiom(
+                    "TENTANDO GERAR DOCUMENTO FINAL COM GPT 4o-mini COMO ÚLTIMA TENTATIVA"
+                )
+                documento_gerado = self.ultima_tentativa_requisicao(prompt)
+            except Exception as e:
+                raise Exception(
+                    "Falha ao gerar o documento final na última tentativa."
+                ) from e
+
+        return documento_gerado
+
+    def select_model_for_last_requests(
+        self,
+        llm_ultimas_requests: Literal[
+            "gpt-4o-mini", "deepseek-chat", "gemini-2.0-flash", "gemini-2.5-pro"
+        ],
+    ):
+        llm_instance = LLM()
+        if llm_ultimas_requests == "gpt-4o-mini":
+            llm = ChatOpenAI(
+                temperature=self.temperature,
+                model=self.model,
+                api_key=SecretStr(openai_api_key),
+            )
+        elif llm_ultimas_requests == "deepseek-chat":
+            llm = llm_instance.deepseek()
+        elif llm_ultimas_requests == "gemini-2.0-flash":
+            llm = llm_instance.google_gemini(
+                "gemini-2.0-flash", temperature=self.temperature
+            )
+        elif llm_ultimas_requests == "gemini-2.5-pro":
+            llm = llm_instance.google_gemini(
+                "gemini-2.5-pro-preview-05-06", temperature=self.temperature
+            )
+        elif llm_ultimas_requests == "gemini-2.5-flash":
+            llm = llm_instance.google_gemini(
+                "gemini-2.5-flash-preview-04-17", temperature=self.temperature
+            )
+        return llm
+
+
+class GerarDocumento:
+    lista_pdfs: List[str]
+    should_use_llama_parse: bool
+    all_PDFs_chunks: List[DocumentChunk]
+    full_text_as_array: List[str]
+    isBubble: bool
+    chunks_processados: List[ContextualizedChunk] | List[DocumentChunk]
+    resumo_auxiliar: str
+    gerar_documento_utils: GerarDocumentoUtils
+    utils = UtilsClass()
+    llm = LLM()
+    enhanced_vector_store: tuple[Chroma, BM25Okapi, List[str]]
+    query_gerado_dinamicamente_para_o_vector_store: str
+    structured_output: List[Any]
+    texto_completo_como_html: str
+    titulo_do_documento: str
+    encoding_tiktoken = tiktoken.get_encoding("cl100k_base")
+    serializer: Union[
+        GerarDocumentoSerializerData, GerarDocumentoComPDFProprioSerializerData, Any
+    ]
+
+    def __init__(
+        self,
+        serializer: Union[
+            GerarDocumentoSerializerData, GerarDocumentoComPDFProprioSerializerData, Any
+        ],
+        isBubble: bool,
+        axiom_instance: Axiom,
+    ):
+        self.gerar_documento_utils = GerarDocumentoUtils(axiom_instance)
+        self.gerar_documento_utils.temperature = serializer.gpt_temperature
+        self.config = self.gerar_documento_utils.create_retrieval_config(serializer)
+        self.serializer = serializer
         self.logger = logging.getLogger(__name__)
         # self.prompt_auxiliar = prompt_auxiliar
         self.gpt_model = serializer.model
-        self.gpt_temperature = serializer.gpt_temperature
+        self.llm_temperature = serializer.gpt_temperature
         self.prompt_gerar_documento = serializer.prompt_gerar_documento
+        self.should_use_llama_parse = serializer.should_use_llama_parse
+        self.isBubble = isBubble
+        self.is_contextualized_chunk = serializer.should_have_contextual_chunks
+        self.contextual_retriever = ContextualRetriever(serializer)
+        self.llm_ultimas_requests = serializer.llm_ultimas_requests
 
-        self.openai_api_key = self.openai_api_key
-        self.cohere_client = Client(self.cohere_api_key)
+        self.cohere_client = Client(cohere_api_key)
         self.embeddings = HuggingFaceEmbeddings(model_name=serializer.hf_embedding)
         self.num_k_rerank = serializer.num_k_rerank
         self.model_cohere_rerank = serializer.model_cohere_rerank
@@ -119,6 +234,75 @@ class GerarDocumento:
 
         self.vector_store = VectorStore(serializer.hf_embedding)
         self.axiom_instance: Axiom = axiom_instance
+        self.ax = AxiomLogs(axiom_instance)
+
+    async def get_text_and_pdf_chunks(self):
+        all_PDFs_chunks, full_text_as_array = (
+            await self.utils.handle_files.get_full_text_and_all_PDFs_chunks(
+                self.lista_pdfs,
+                self.splitter,
+                self.should_use_llama_parse,
+                self.isBubble,
+            )
+        )
+        self.ax.texto_completo_pdf(full_text_as_array)
+
+        self.all_PDFs_chunks = all_PDFs_chunks
+        self.full_text_as_array = full_text_as_array
+        return all_PDFs_chunks, full_text_as_array
+
+    async def generate_chunks_processados(self):
+        if self.is_contextualized_chunk:
+            self.ax.inicio_requisicao_contextual()
+            contextualized_chunks = (
+                await self.contextual_retriever.contextualize_all_chunks(
+                    self.all_PDFs_chunks, self.resumo_auxiliar, self.axiom_instance
+                )
+            )
+            self.ax.fim_requisicao_contextual()
+
+        chunks_processados = (
+            contextualized_chunks
+            if self.is_contextualized_chunk
+            else self.all_PDFs_chunks
+        )
+        self.chunks_processados = chunks_processados
+        if len(self.chunks_processados) == 0:
+            self.chunks_processados = self.all_PDFs_chunks
+        self.ax.chunks_inicialmente(self.chunks_processados)
+        return self.chunks_processados
+
+    async def generate_query_for_vector_store(self):
+        prompt_para_gerar_query_dinamico = prompt_gerar_query_dinamicamente(
+            cast(str, self.resumo_auxiliar)
+        )
+
+        self.axiom_instance.send_axiom(
+            "COMEÇANDO REQUISIÇÃO PARA GERAR O QUERY DINAMICAMENTE DO VECTOR STORE"
+        )
+        response = await self.llm.google_gemini_ainvoke(
+            prompt_para_gerar_query_dinamico,
+            "gemini-2.0-flash",
+            temperature=self.llm_temperature,
+        )
+
+        self.query_gerado_dinamicamente_para_o_vector_store = cast(
+            str, response.content
+        )
+
+        self.axiom_instance.send_axiom(
+            f"query_gerado_dinamicamente_para_o_vector_store: {self.query_gerado_dinamicamente_para_o_vector_store}",
+        )
+
+        return self.query_gerado_dinamicamente_para_o_vector_store
+
+    async def create_enhanced_vector_store(self):
+        vector_store, bm25, chunk_ids = self.vector_store.create_enhanced_vector_store(
+            self.chunks_processados, self.is_contextualized_chunk, self.axiom_instance  # type: ignore
+        )
+
+        self.enhanced_vector_store = vector_store, bm25, chunk_ids
+        return vector_store, bm25, chunk_ids
 
     def retrieve_with_rank_fusion(
         self, vector_store: Chroma, bm25: BM25Okapi, chunk_ids: List[str], query: str
@@ -208,40 +392,18 @@ class GerarDocumento:
 
         return sources, contexts
 
-    def select_model_for_last_requests(
-        self,
-        llm_ultimas_requests: Literal[
-            "gpt-4o-mini", "deepseek-chat", "gemini-2.0-flash", "gemini-2.5-pro"
-        ],
-    ):
-        llm_instance = LLM()
-        if llm_ultimas_requests == "gpt-4o-mini":
-            llm = ChatOpenAI(
-                temperature=self.gpt_temperature,
-                model=self.gpt_model,
-                api_key=SecretStr(self.openai_api_key),
-            )
-        elif llm_ultimas_requests == "deepseek-chat":
-            llm = llm_instance.deepseek()
-        elif llm_ultimas_requests == "gemini-2.0-flash":
-            llm = llm_instance.google_gemini("gemini-2.0-flash")
-        elif llm_ultimas_requests == "gemini-2.5-pro":
-            llm = llm_instance.google_gemini("gemini-2.5-pro-preview-05-06")
-        elif llm_ultimas_requests == "gemini-2.5-flash":
-            llm = llm_instance.google_gemini("gemini-2.5-flash-preview-04-17")
-        return llm
-
-    async def gerar_documento_final(
+    async def do_last_requests(
         self,
-        vector_store: Chroma,
-        bm25: BM25Okapi,
-        chunk_ids: List[str],
-        llm_ultimas_requests: str,
-        query: str = "Summarize the main points of this document",
     ) -> List[Dict]:
         try:
+            self.axiom_instance.send_axiom("COMEÇANDO A FAZER ÚLTIMA REQUISIÇÃO")
+            vector_store, bm25, chunk_ids = self.enhanced_vector_store
+
             sources, contexts = self.rank_fusion_get_top_results(
-                vector_store, bm25, chunk_ids, query
+                vector_store,
+                bm25,
+                chunk_ids,
+                self.query_gerado_dinamicamente_para_o_vector_store,
             )
 
             prompt_gerar_documento = PromptTemplate(
@@ -249,14 +411,16 @@ class GerarDocumento:
                 input_variables=["context"],
             )
 
-            llm = self.select_model_for_last_requests(llm_ultimas_requests)  # type: ignore
+            llm_ultimas_requests = self.llm_ultimas_requests
             prompt_instance = Prompt()
             context_do_prompt_primeira_etapa = "\n\n".join(contexts)
             prompt_primeira_etapa = prompt_gerar_documento.format(
                 context=context_do_prompt_primeira_etapa,
            )
 
+            self.gerar_documento_utils.model = self.gpt_model
+            self.gerar_documento_utils.temperature = self.llm_temperature
-            documento_gerado = await self.checar_se_resposta_vazia_do_documento_final(
+            documento_gerado = await self.gerar_documento_utils.checar_se_resposta_vazia_do_documento_final(
                llm_ultimas_requests, prompt_primeira_etapa
            )
 
@@ -274,7 +438,7 @@ class GerarDocumento:
                dynamic_dict={"context": context_do_prompt_primeira_etapa},
            )
            # documento_gerado = llm.invoke(prompt_etapa_2).content
-            documento_gerado = self.checar_se_resposta_vazia_do_documento_final(
+            documento_gerado = self.gerar_documento_utils.checar_se_resposta_vazia_do_documento_final(
                llm_ultimas_requests, prompt_etapa_2.to_string()
            )
            resposta_segunda_etapa = documento_gerado
@@ -292,7 +456,7 @@ class GerarDocumento:
                },
            )
            # documento_gerado = llm.invoke(prompt_etapa_3).content
-            documento_gerado = self.checar_se_resposta_vazia_do_documento_final(
+            documento_gerado = self.gerar_documento_utils.checar_se_resposta_vazia_do_documento_final(
                llm_ultimas_requests, prompt_etapa_3.to_string()
            )
            texto_final_juntando_as_etapas += f"\n\n{documento_gerado}"
@@ -306,49 +470,57 @@ class GerarDocumento:
             structured_output = self.gerar_documento_utils.criar_output_estruturado(
                 summaries, sources
             )
+
+            self.axiom_instance.send_axiom("TERMINOU DE FAZER A ÚLTIMA REQUISIÇÃO")
+            self.structured_output = structured_output
             return structured_output
 
         except Exception as e:
             self.logger.error(f"Error generating enhanced summary: {str(e)}")
             raise
 
-    async def checar_se_resposta_vazia_do_documento_final(
-        self, llm_ultimas_requests: str, prompt: str
-    ):
-        llm = self.select_model_for_last_requests(llm_ultimas_requests)  # type: ignore
-        documento_gerado = ""
-        tentativas = 0
-
-        while tentativas < 5 and not documento_gerado:
-            tentativas += 1
-            try:
-                resposta = llm.invoke(prompt)
-                if hasattr(resposta, "content") and resposta.content.strip():  # type: ignore
-                    if isinstance(resposta.content, list):
-                        resposta.content = "\n".join(resposta.content)  # type: ignore
-
-                    documento_gerado = resposta.content.strip()  # type: ignore
-                else:
-                    print(f"Tentativa {tentativas}: resposta vazia ou inexistente.")
-            except Exception as e:
-                llm = self.select_model_for_last_requests("gemini-2.0-flash")
-                print(f"Tentativa {tentativas}: erro ao invocar o modelo: {e}")
-                time.sleep(5)
-
-        if not documento_gerado:
-            try:
-                self.axiom_instance.send_axiom(
-                    "TENTANDO GERAR DOCUMENTO FINAL COM GPT 4o-mini COMO ÚLTIMA TENTATIVA"
-                )
-                documento_gerado = (
-                    self.gerar_documento_utils.ultima_tentativa_requisicao(prompt)
-                )
-            except Exception as e:
-                raise Exception(
-                    "Falha ao gerar o documento final na última tentativa."
-                ) from e
-
-        return documento_gerado
+    async def generate_complete_text(self):
+        texto_completo = "\n\n"
+
+        for x in self.structured_output:
+            texto_completo = texto_completo + x["content"] + "\n"
+            x["source"]["text"] = x["source"]["text"][0:200]
+            x["source"]["context"] = x["source"]["context"][0:200]
+
+        self.texto_completo_como_html = convert_markdown_to_HTML(
+            texto_completo
+        ).replace("resposta_segunda_etapa:", "<br><br>")
+
+        self.axiom_instance.send_axiom(
+            f"texto_completo_como_html: {self.texto_completo_como_html}"
+        )
+
+    async def get_document_title(self):
+        if self.is_contextualized_chunk:
+            resumo_para_gerar_titulo = self.resumo_auxiliar
+        else:
+            resumo_para_gerar_titulo = self.texto_completo_como_html
+
+        prompt = prompt_para_gerar_titulo(resumo_para_gerar_titulo)
+        response = await agemini_answer(
+            prompt, "gemini-2.0-flash-lite", temperature=self.llm_temperature
+        )
+        self.titulo_do_documento = response
+        return self.titulo_do_documento
+
+    async def send_to_bubble(self):
+        self.axiom_instance.send_axiom("COMEÇANDO A REQUISIÇÃO FINAL PARA O BUBBLE")
+
+        enviar_resposta_final(
+            self.serializer.doc_id,  # type: ignore
+            self.serializer.form_response_id,  # type: ignore
+            self.serializer.version,  # type: ignore
+            self.texto_completo_como_html,
+            False,
+            cast(str, self.titulo_do_documento),
+        )
+
+        self.axiom_instance.send_axiom("TERMINOU A REQUISIÇÃO FINAL PARA O BUBBLE")
 
     async def gerar_ementa_final(
         self,
@@ -357,10 +529,10 @@ class GerarDocumento:
         context_primeiro_prompt: str,
     ):
 
-        llm = self.select_model_for_last_requests(llm_ultimas_requests)  # type: ignore
+        llm = self.gerar_documento_utils.select_model_for_last_requests(llm_ultimas_requests)  # type: ignore
         prompt_instance = Prompt()
 
-        documento_gerado = await self.checar_se_resposta_vazia_do_documento_final(
+        documento_gerado = await self.gerar_documento_utils.checar_se_resposta_vazia_do_documento_final(
            llm_ultimas_requests, prompt_primeira_etapa
        )
 
@@ -395,3 +567,77 @@ class GerarDocumento:
         self.axiom_instance.send_axiom(f"RESULTADO ETAPA 3: {documento_gerado}")
 
         return texto_final_juntando_as_etapas
+
+    # Esta função gera a resposta que será usada em cada um das requisições de cada chunk
+    async def get_response_from_auxiliar_contextual_prompt(self):
+        llms = LLM()
+        responses = []
+
+        current_chunk = []
+        current_token_count = 0
+        chunk_counter = 1
+
+        for part in self.full_text_as_array:
+            part_tokens = len(self.encoding_tiktoken.encode(part))
+
+            # Check if adding this part would EXCEED the limit
+            if current_token_count + part_tokens > 600000:
+                # Process the accumulated chunk before it exceeds the limit
+                chunk_text = "".join(current_chunk)
+                print(
+                    f"\nProcessing chunk {chunk_counter} with {current_token_count} tokens"
+                )
+
+                prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
+                response = await llms.google_gemini(
+                    temperature=self.llm_temperature
+                ).ainvoke([HumanMessage(content=prompt)])
+                responses.append(response.content)
+
+                # Start new chunk with current part
+                current_chunk = [part]
+                current_token_count = part_tokens
+                chunk_counter += 1
+            else:
+                # Safe to add to current chunk
+                current_chunk.append(part)
+                current_token_count += part_tokens
+
+        # Process the final remaining chunk
+        if current_chunk:
+            chunk_text = "".join(current_chunk)
+            print(
+                f"\nProcessing final chunk {chunk_counter} with {current_token_count} tokens"
+            )
+            prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
+            response = await llms.google_gemini(
+                temperature=self.llm_temperature
+            ).ainvoke([HumanMessage(content=prompt)])
+            responses.append(response.content)
+
+        self.resumo_auxiliar = "".join(responses)
+        self.ax.resumo_inicial_processo(self.resumo_auxiliar)
+
+        return self.resumo_auxiliar
+
+    def gerar_resposta_compilada(self):
+        serializer = self.serializer
+        return {
+            "num_chunks_retrieval": serializer.num_chunks_retrieval,
+            "embedding_weight": serializer.embedding_weight,
+            "bm25_weight": serializer.bm25_weight,
+            "context_window": serializer.context_window,
+            "chunk_overlap": serializer.chunk_overlap,
+            "num_k_rerank": serializer.num_k_rerank,
+            "model_cohere_rerank": serializer.model_cohere_rerank,
+            "more_initial_chunks_for_reranking": serializer.more_initial_chunks_for_reranking,
+            "claude_context_model": serializer.claude_context_model,
+            "gpt_temperature": serializer.gpt_temperature,
+            "user_message": serializer.user_message,
+            "model": serializer.model,
+            "hf_embedding": serializer.hf_embedding,
+            "chunk_size": serializer.chunk_size,
+            "chunk_overlap": serializer.chunk_overlap,
+            # "prompt_auxiliar": serializer.prompt_auxiliar,
+            "prompt_gerar_documento": serializer.prompt_gerar_documento[0:200],
+        }
_utils/gerar_documento_utils/llm_calls.py CHANGED
@@ -62,8 +62,9 @@ async def agemini_answer(
     model: Literal[
         "gemini-2.5-pro-preview-05-06", "gemini-2.0-flash", "gemini-2.0-flash-lite"
     ] = "gemini-2.0-flash",
+    temperature=0.4,
 ) -> str:
-    gemini = llm.google_gemini(model)
+    gemini = llm.google_gemini(model, temperature)
     resposta = await gemini.ainvoke([HumanMessage(content=prompt)])
 
     if isinstance(resposta.content, list):
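With the new temperature parameter, callers can pin the sampling temperature per request (GerarDocumento.get_document_title above passes serializer.gpt_temperature); a minimal sketch of such a call:

from _utils.gerar_documento_utils.llm_calls import agemini_answer


async def deterministic_answer(prompt: str) -> str:
    # temperature=0 keeps short, single-line answers (for example document titles)
    # stable across retries; the default stays at 0.4.
    return await agemini_answer(prompt, "gemini-2.0-flash-lite", temperature=0)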
_utils/gerar_documento_utils/prompts.py CHANGED
@@ -1,4 +1,14 @@
-def create_prompt_auxiliar_do_contextual_prompt(PROCESSO_JURIDICO: str):
+def create_prompt_auxiliar_do_contextual_prompt(PROCESSO_JURIDICO: str | None = None):
+    if PROCESSO_JURIDICO:
+        adicionar_ao_prompt = f"""
+1. **Análise Completa:** Leia e analise todo o conteúdo do processo fornecido.
+<processo_juridico>
+{PROCESSO_JURIDICO}
+</processo_juridico>"""
+    else:
+        adicionar_ao_prompt = """
+1. **Análise Completa:** Leia e analise todo o conteúdo do processo fornecido como PDF."""
+
     return f"""
 <prompt>
 <persona>
@@ -46,10 +56,7 @@ Seu objetivo é analisar o processo jurídico fornecido e gerar um relatório co
 <instrucoes>
 Siga estritamente os passos abaixo:
 
-1. **Análise Completa:** Leia e analise todo o conteúdo do processo fornecido.
-<processo_juridico>
-{PROCESSO_JURIDICO}
-</processo_juridico>
+{adicionar_ao_prompt}
 
 2. **Identificação e Listagem de Peças:** Identifique quais das peças listadas na `<tarefa>` estão presentes no texto. Liste **apenas** as encontradas na tag `<pecas_identificadas>`.
 
@@ -239,3 +246,7 @@ def prompt_gerar_query_dinamicamente(resumo_do_processo: str):
 
 - *Importante:** Sua resposta final deve ser *somente* a string da query.
 """
+
+
+def prompt_para_gerar_titulo(resumo_para_gerar_titulo: str):
+    return f"Você é um assistente jurídico e irá receber abaixo o resumo de um documento jurídico. Quero que você gere um título para este documento. Mande como resposta apenas o título gerado, nada mais. Aqui está um título de exemplo pra você se basear ao criar um novo: <titulo_de_exemplo>Ação Penal por Furto Qualificado nº 0002269-86.2009.805.0032<titulo_de_exemplo>\n\nSegue abaixo o resumo do documento jurídico:\n{resumo_para_gerar_titulo}"
_utils/gerar_documento_utils/utils.py CHANGED
@@ -1,91 +1,10 @@
-from typing import Any, List, Tuple, Union
-from langchain_core.documents import Document
-from langchain_core.messages import HumanMessage
-
 from _utils.gerar_documento_utils.llm_calls import agemini_answer
-from _utils.langchain_utils.Splitter_class import Splitter
-from _utils.langchain_utils.LLM_class import LLM
-from _utils.gerar_documento_utils.prompts import (
-    create_prompt_auxiliar_do_contextual_prompt,
-)
-
-from _utils.models.gerar_documento import DocumentChunk
-from gerar_documento.serializer import GerarDocumentoSerializerData
+from _utils.gerar_documento_utils.prompts import prompt_para_gerar_titulo
 import tiktoken
 
 encoding = tiktoken.get_encoding("cl100k_base")
 
 
-def gerar_resposta_compilada(serializer: Union[GerarDocumentoSerializerData, Any]):
-    return {
-        "num_chunks_retrieval": serializer.num_chunks_retrieval,
-        "embedding_weight": serializer.embedding_weight,
-        "bm25_weight": serializer.bm25_weight,
-        "context_window": serializer.context_window,
-        "chunk_overlap": serializer.chunk_overlap,
-        "num_k_rerank": serializer.num_k_rerank,
-        "model_cohere_rerank": serializer.model_cohere_rerank,
-        "more_initial_chunks_for_reranking": serializer.more_initial_chunks_for_reranking,
-        "claude_context_model": serializer.claude_context_model,
-        "gpt_temperature": serializer.gpt_temperature,
-        "user_message": serializer.user_message,
-        "model": serializer.model,
-        "hf_embedding": serializer.hf_embedding,
-        "chunk_size": serializer.chunk_size,
-        "chunk_overlap": serializer.chunk_overlap,
-        # "prompt_auxiliar": serializer.prompt_auxiliar,
-        "prompt_gerar_documento": serializer.prompt_gerar_documento[0:200],
-    }
-
-
-# Esta função gera a resposta que será usada em cada um das requisições de cada chunk
-async def get_response_from_auxiliar_contextual_prompt(full_text_as_array: List[str]):
-    llms = LLM()
-    responses = []
-
-    current_chunk = []
-    current_token_count = 0
-    chunk_counter = 1
-
-    for part in full_text_as_array:
-        part_tokens = len(encoding.encode(part))
-
-        # Check if adding this part would EXCEED the limit
-        if current_token_count + part_tokens > 600000:
-            # Process the accumulated chunk before it exceeds the limit
-            chunk_text = "".join(current_chunk)
-            print(
-                f"\nProcessing chunk {chunk_counter} with {current_token_count} tokens"
-            )
-
-            prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
-            response = await llms.google_gemini().ainvoke(
-                [HumanMessage(content=prompt)]
-            )
-            responses.append(response.content)
-
-            # Start new chunk with current part
-            current_chunk = [part]
-            current_token_count = part_tokens
-            chunk_counter += 1
-        else:
-            # Safe to add to current chunk
-            current_chunk.append(part)
-            current_token_count += part_tokens
-
-    # Process the final remaining chunk
-    if current_chunk:
-        chunk_text = "".join(current_chunk)
-        print(
-            f"\nProcessing final chunk {chunk_counter} with {current_token_count} tokens"
-        )
-        prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
-        response = await llms.google_gemini().ainvoke([HumanMessage(content=prompt)])
-        responses.append(response.content)
-
-    return "".join(responses)
-
-
 def split_text_by_tokens(full_text: str):
     tokens = encoding.encode(full_text)
     max_tokens = 600000
@@ -101,27 +20,7 @@ def split_text_by_tokens(full_text: str):
     return text_chunks
 
 
-async def get_full_text_and_all_PDFs_chunks(
-    listaPDFs: List[str],
-    splitterObject: Splitter,
-    should_use_llama_parse: bool,
-    isBubble: bool,
-) -> Tuple[List[DocumentChunk], List[str]]:
-    all_PDFs_chunks: List[DocumentChunk] = []
-
-    pages: List[str] = []
-
-    # Load and process document
-    for pdf_path in listaPDFs:
-        chunks, pages = await splitterObject.load_and_split_document(
-            pdf_path, should_use_llama_parse, isBubble
-        )
-        all_PDFs_chunks = all_PDFs_chunks + chunks
-
-    return all_PDFs_chunks, pages
-
-
 async def generate_document_title(resumo_para_gerar_titulo: str):
-    prompt = f"Você é um assistente jurídico e irá receber abaixo o resumo de um documento jurídico. Quero que você gere um título para este documento. Mande como resposta apenas o título gerado, nada mais. Aqui está um título de exemplo pra você se basear ao criar um novo: <titulo_de_exemplo>Ação Penal por Furto Qualificado nº 0002269-86.2009.805.0032<titulo_de_exemplo>\n\nSegue abaixo o resumo do documento jurídico:\n{resumo_para_gerar_titulo}"
+    prompt = prompt_para_gerar_titulo(resumo_para_gerar_titulo)
     response = await agemini_answer(prompt, "gemini-2.0-flash-lite")
     return response
_utils/google_integration/google_cloud.py ADDED
@@ -0,0 +1,28 @@
+import os
+from google.cloud import storage
+
+GCP_PROJECT = "gen-lang-client-0350149082"
+GCP_REGION = "us-central1"
+DOCUMENT_API_ID = "b34a20d22dee16bb"
+GCS_BUCKET_NAME = "vella-pdfs"
+
+
+def upload_to_gcs(LOCAL_PDF_PATH: str) -> str:
+
+    # Path in GCS
+    GCS_DESTINATION_BLOB_NAME = "gemini_uploads/" + os.path.basename(LOCAL_PDF_PATH)
+
+    """Uploads a file to a GCS bucket and returns its URI."""
+    storage_client = storage.Client(
+        project=GCP_PROJECT,
+    )
+    bucket = storage_client.bucket(GCS_BUCKET_NAME)
+    blob = bucket.blob(GCS_DESTINATION_BLOB_NAME)
+
+    print(
+        f"Uploading {LOCAL_PDF_PATH} to gs://{GCS_BUCKET_NAME}/{GCS_DESTINATION_BLOB_NAME}..."
+    )
+    blob.upload_from_filename(LOCAL_PDF_PATH)
+    gcs_uri = f"gs://{GCS_BUCKET_NAME}/{GCS_DESTINATION_BLOB_NAME}"
+    print(f"File uploaded to {gcs_uri}")
+    return gcs_uri
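upload_to_gcs returns a gs:// URI that the Document AI and Vertex AI helpers in this commit consume. A minimal usage sketch, assuming google-cloud-storage is installed and credentials are configured as described in the .env.example changes above; the local path is a placeholder:

from _utils.google_integration.google_cloud import upload_to_gcs


def push_pdf_to_bucket(local_pdf_path: str) -> str:
    # Returns a gs://vella-pdfs/gemini_uploads/<filename> URI that can be
    # referenced by the OCR and Vertex AI calls added in this commit.
    return upload_to_gcs(local_pdf_path)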
_utils/langchain_utils/LLM_class.py CHANGED
@@ -1,9 +1,10 @@
1
- from typing import Literal, cast
2
  from pydantic import SecretStr
3
- from setup.environment import default_model
4
  from setup.easy_imports import ChatOpenAI, ChatGoogleGenerativeAI
5
  import os
6
  from langchain_core.messages import HumanMessage
 
7
 
8
  deepseek_api_key = cast(str, os.environ.get("DEEPSEEKK_API_KEY"))
9
  google_api_key = cast(str, os.environ.get("GOOGLE_API_KEY_PEIXE"))
@@ -31,14 +32,11 @@ class LLM:
31
  model=model,
32
  )
33
 
34
- def google_gemini(
35
- self,
36
- model: Google_llms = "gemini-2.0-flash",
37
- ):
38
  return ChatGoogleGenerativeAI(
39
  api_key=SecretStr(google_api_key),
40
  model=model,
41
- temperature=0.4,
42
  max_tokens=None,
43
  timeout=None,
44
  max_retries=2,
@@ -49,10 +47,11 @@ class LLM:
49
  prompt: str,
50
  model: Google_llms = "gemini-2.0-flash",
51
  max_retries: int = 3,
 
52
  ):
53
  for attempt in range(max_retries):
54
  try:
55
- response = await self.google_gemini(model).ainvoke(
56
  [HumanMessage(content=prompt)]
57
  )
58
 
@@ -75,3 +74,44 @@ class LLM:
75
  raise Exception(
76
  "Failed to generate the final document after 5 retries and the fallback attempt with chat-gpt-4o-mini."
77
  ) from e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Literal, cast
2
  from pydantic import SecretStr
3
+ from _utils.google_integration.google_cloud import GCP_PROJECT, upload_to_gcs
4
  from setup.easy_imports import ChatOpenAI, ChatGoogleGenerativeAI
5
  import os
6
  from langchain_core.messages import HumanMessage
7
+ from langchain_google_vertexai import ChatVertexAI
8
 
9
  deepseek_api_key = cast(str, os.environ.get("DEEPSEEKK_API_KEY"))
10
  google_api_key = cast(str, os.environ.get("GOOGLE_API_KEY_PEIXE"))
 
32
  model=model,
33
  )
34
 
35
+ def google_gemini(self, model: Google_llms = "gemini-2.0-flash", temperature=0.4):
 
 
 
36
  return ChatGoogleGenerativeAI(
37
  api_key=SecretStr(google_api_key),
38
  model=model,
39
+ temperature=temperature,
40
  max_tokens=None,
41
  timeout=None,
42
  max_retries=2,
 
47
  prompt: str,
48
  model: Google_llms = "gemini-2.0-flash",
49
  max_retries: int = 3,
50
+ temperature=0.4,
51
  ):
52
  for attempt in range(max_retries):
53
  try:
54
+ response = await self.google_gemini(model, temperature).ainvoke(
55
  [HumanMessage(content=prompt)]
56
  )
57
 
 
74
  raise Exception(
75
  "Failed to generate the final document after 5 retries and the fallback attempt with chat-gpt-4o-mini."
76
  ) from e
77
+
78
+ async def google_gemini_vertex_ainvoke(
79
+ self,
80
+ prompt: str,
81
+ list_of_pdfs: List[str],
82
+ model: Google_llms = "gemini-2.5-flash-preview-04-17",
83
+ max_retries: int = 3,
84
+ ) -> str | None:
85
+ message_parts = [
86
+ {"type": "text", "text": prompt},
87
+ ]
88
+ for pdf in list_of_pdfs:
89
+ pdf_gcs_uri = upload_to_gcs(pdf)
90
+ message_parts.append(
91
+ {
92
+ # This structure is used for file references via URI
93
+ "type": "media",
94
+ "mime_type": "application/pdf", # <-- mime_type moved up
95
+ "file_uri": pdf_gcs_uri, # <-- file_uri moved up
96
+ }
97
+ )
98
+
99
+ for attempt in range(max_retries):
100
+ try:
101
+ llm = ChatVertexAI(
102
+ model_name=model,
103
+ project=GCP_PROJECT,
104
+ location="us-central1",
105
+ temperature=0,
106
+ )
107
+ response = await llm.ainvoke(
108
+ [HumanMessage(content=message_parts)] # type: ignore
109
+ )
110
+
111
+ if isinstance(response.content, list):
112
+ response.content = "\n".join(response.content) # type: ignore
113
+
114
+ return response.content # type: ignore
115
+ except Exception as e:
116
+ model = "gemini-2.0-flash"
117
+ print(f"Attempt {attempt + 1} failed with error: {e}")
_utils/langchain_utils/Splitter_class.py CHANGED
@@ -1,4 +1,9 @@
1
- from _utils.bubble_integrations.obter_arquivo import get_pdf_from_bubble
 
 
 
 
 
2
  from _utils.handle_files import return_document_list_with_llama_parser
3
  from _utils.langchain_utils.splitter_util import (
4
  Splitter_Simple,
@@ -18,6 +23,17 @@ from _utils.models.gerar_documento import (
18
  DocumentChunk,
19
  )
20
  import uuid
21
 
22
 
23
  class Splitter:
@@ -34,7 +50,10 @@ class Splitter:
34
  self.chunk_metadata = {} # Store chunk metadata for tracing
35
 
36
  async def load_and_split_document(
37
- self, pdf_path: str, should_use_llama_parse: bool, isBubble: bool
 
 
 
38
  ):
39
  """Load PDF and split into chunks with metadata"""
40
  # loader = PyPDFLoader(pdf_path)
@@ -144,6 +163,13 @@ class Splitter:
144
  # char_count += len(text)
145
  print("TERMINOU DE ORGANIZAR PDFS EM CHUNKS")
146
 
 
 
 
 
 
 
 
147
  return chunks, chunks_of_string_only
148
 
149
  def load_and_split_text(self, text: str) -> List[DocumentChunk]:
@@ -185,3 +211,132 @@ class Splitter:
185
  char_count += len(text)
186
 
187
  return chunks
1
+ import os
2
+ import time
3
+ from _utils.bubble_integrations.obter_arquivo import (
4
+ download_file_from_bubble,
5
+ get_pdf_from_bubble,
6
+ )
7
  from _utils.handle_files import return_document_list_with_llama_parser
8
  from _utils.langchain_utils.splitter_util import (
9
  Splitter_Simple,
 
23
  DocumentChunk,
24
  )
25
  import uuid
26
+ import json
27
+ from _utils.google_integration.google_cloud import (
28
+ DOCUMENT_API_ID,
29
+ GCP_PROJECT,
30
+ GCP_REGION,
31
+ GCS_BUCKET_NAME,
32
+ upload_to_gcs,
33
+ )
34
+ from google.cloud import documentai
35
+ from google.cloud import storage
36
+ from _utils.bubble_integrations.obter_arquivo import headers
37
 
38
 
39
  class Splitter:
 
50
  self.chunk_metadata = {} # Store chunk metadata for tracing
51
 
52
  async def load_and_split_document(
53
+ self,
54
+ pdf_path: str,
55
+ should_use_llama_parse: bool,
56
+ isBubble: bool,
57
  ):
58
  """Load PDF and split into chunks with metadata"""
59
  # loader = PyPDFLoader(pdf_path)
 
163
  # char_count += len(text)
164
  print("TERMINOU DE ORGANIZAR PDFS EM CHUNKS")
165
 
166
+ if len(pages) == 0 or len(chunks) == 0:
167
+ if isBubble:
168
+ pdf_path = download_file_from_bubble(pdf_path, headers, "pdf")
169
+ text = await self.getOCRFromGoogleDocumentAPI(pdf_path)
170
+ chunks = self.load_and_split_text(text) # type: ignore
171
+ chunks_of_string_only = [chunk.content for chunk in chunks]
172
+
173
  return chunks, chunks_of_string_only
174
 
175
  def load_and_split_text(self, text: str) -> List[DocumentChunk]:
 
211
  char_count += len(text)
212
 
213
  return chunks
214
+
215
+ async def getOCRFromGoogleDocumentAPI(self, pdf_path: str):
216
+
217
+ pdf_gcs_uri = upload_to_gcs(pdf_path)
218
+
219
+ GCS_OUTPUT_PREFIX = "documentai_output/"
220
+ # GCS_INPUT_URI = f"gs://{GCS_BUCKET_NAME}/{f"gemini_uploads/{pdf_gcs_uri}"}"
221
+ GCS_INPUT_URI = pdf_gcs_uri
222
+ GCS_OUTPUT_URI = f"gs://{GCS_BUCKET_NAME}/{GCS_OUTPUT_PREFIX}"
223
+
224
+ docai_client = documentai.DocumentProcessorServiceClient()
225
+
226
+ processor_name = docai_client.processor_path(
227
+ project=GCP_PROJECT, location="us", processor=DOCUMENT_API_ID
228
+ )
229
+
230
+ gcs_document = documentai.GcsDocument(
231
+ gcs_uri=GCS_INPUT_URI,
232
+ mime_type="application/pdf", # Mime type is specified here for GcsDocument
233
+ )
234
+
235
+ gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
236
+
237
+ # 3. Create the BatchDocumentsInputConfig
238
+ input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)
239
+ # Note: If GCS_INPUT_URI was a prefix for multiple files, you'd use GcsPrefix:
240
+ # gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=GCS_INPUT_URI_PREFIX)
241
+ # input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix, mime_type="application/pdf")
242
+
243
+ # 4. Create the DocumentOutputConfig
244
+ # GCS_OUTPUT_URI should be a gs:// URI prefix where the output JSONs will be stored
245
+ output_config = documentai.DocumentOutputConfig(
246
+ gcs_output_config=documentai.DocumentOutputConfig.GcsOutputConfig(
247
+ gcs_uri=GCS_OUTPUT_URI
248
+ )
249
+ )
250
+
251
+ # 5. Construct the BatchProcessRequest
252
+ request = documentai.BatchProcessRequest(
253
+ name=processor_name,
254
+ input_documents=input_config, # Use 'input_documents'
255
+ document_output_config=output_config, # Use 'document_output_config'
256
+ )
257
+
258
+ # Submit the batch process request (this is a long-running operation)
259
+ operation = docai_client.batch_process_documents(request)
260
+
261
+ print("Batch processing operation started. Waiting for completion...")
262
+ while not operation.done():
263
+ time.sleep(15) # Wait for 15 seconds before checking again
264
+ print("Waiting...")
265
+
266
+ print("Batch processing operation finished.")
267
+
268
+ # --- Download the results from GCS ---
269
+ storage_client = storage.Client(
270
+ project=GCP_PROJECT
271
+ ) # Uses GOOGLE_APPLICATION_CREDENTIALS/ADC
272
+ bucket = storage_client.bucket(GCS_BUCKET_NAME)
273
+
274
+ output_blobs = storage_client.list_blobs(
275
+ GCS_BUCKET_NAME, prefix=GCS_OUTPUT_PREFIX
276
+ )
277
+
278
+ downloaded_files_texts = []
279
+ try:
280
+ for blob in output_blobs:
281
+ # Document AI adds suffixes and subdirectories. Look for the actual JSON output files.
282
+ # The exact naming depends on the processor and options. Common pattern is ending with .json
283
+ if blob.name.endswith(".json"):
284
+ local_download_path = os.path.basename(
285
+ blob.name
286
+ ) # Download to current directory with blob name
287
+ print(f"Downloading {blob.name} to {local_download_path}...")
288
+ blob.download_to_filename(local_download_path)
289
+
290
+ with open(local_download_path, "r", encoding="utf-8") as f:
291
+ document_data = json.load(f)
292
+
293
+ # The top-level 'text' field contains the concatenated plain text.
294
+ if "text" in document_data and document_data["text"] is not None:
295
+ raw_text = document_data["text"]
296
+ print(f"\n--- Raw Text Extracted from {blob.name} ---")
297
+ # Print only a snippet or process as needed
298
+ print(
299
+ raw_text[:1000] + "..."
300
+ if len(raw_text) > 1000
301
+ else raw_text
302
+ )
303
+ print("--------------------------------------------")
304
+
305
+ return raw_text
306
+
307
+ # Optional: Store the text. If you processed a batch of files,
308
+ # you might want to associate the text with the original file name.
309
+ # Document AI metadata might link output JSONs back to input files.
310
+ # For simplicity here, let's just show the extraction.
311
+ # If you know it was a single input PDF, this is all the text.
312
+ # If it was multiple, you'd need a mapping or process each JSON.
313
+
314
+ else:
315
+ print(
316
+ f"Warning: 'text' field not found in {blob.name} or is empty."
317
+ )
318
+
319
+ # Optional: Read and print a snippet of the JSON content
320
+ # with open(local_download_path, 'r', encoding='utf-8') as f:
321
+ # data = json.load(f)
322
+ # # Print some extracted text, for example (structure varies by processor)
323
+ # if 'text' in data:
324
+ # print(f"Extracted text snippet: {data['text'][:500]}...") # Print first 500 chars
325
+ # elif 'entities' in data:
326
+ # print(f"Number of entities found: {len(data['entities'])}")
327
+ # else:
328
+ # print("Output JSON structure not immediately recognizable.")
329
+ # break # Uncomment if you only expect/need to process the first output file
330
+
331
+ if len(downloaded_files_texts) == 0 or not downloaded_files_texts:
332
+ print("No JSON output files found in the specified output location.")
333
+
334
+ except Exception as e:
335
+ print(f"Error listing or downloading output files: {e}")
336
+
337
+ print("\nProcess complete.")
338
+ if downloaded_files_texts:
339
+ print(f"Downloaded output file(s): {', '.join(downloaded_files_texts)}")
340
+ print("These files contain the OCR results in JSON format.")
341
+ else:
342
+ print("No output files were successfully downloaded.")
_utils/langchain_utils/Vector_store_class.py CHANGED
@@ -22,6 +22,8 @@ class VectorStore:
22
  axiom_instance: Axiom,
23
  ) -> Tuple[Chroma, BM25Okapi, List[str]]:
24
  """Create vector store and BM25 index with contextualized chunks"""
 
 
25
  try:
26
  # Prepare texts with context
27
  if is_contextualized_chunk:
@@ -69,5 +71,9 @@ class VectorStore:
69
  return vector_store, bm25, chunk_ids
70
 
71
  except Exception as e:
 
 
 
 
72
  self.logger.error(f"Error creating enhanced vector store: {str(e)}")
73
- raise Exception(f"Error creating enhanced vector store: {str(e)}")
 
22
  axiom_instance: Axiom,
23
  ) -> Tuple[Chroma, BM25Okapi, List[str]]:
24
  """Create vector store and BM25 index with contextualized chunks"""
25
+ contador_erro = 0
26
+
27
  try:
28
  # Prepare texts with context
29
  if is_contextualized_chunk:
 
71
  return vector_store, bm25, chunk_ids
72
 
73
  except Exception as e:
74
+ contador_erro += 1
75
+ if contador_erro >= 2:
76
+ raise Exception(f"Error creating enhanced vector store: {str(e)}")
77
+
78
  self.logger.error(f"Error creating enhanced vector store: {str(e)}")
79
+ return self.create_enhanced_vector_store(chunks, False, axiom_instance)
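One caveat with the retry added above: contador_erro is a local variable reset to 0 on every call, including the recursive one, so the >= 2 guard never fires and a persistent failure would recurse until Python's recursion limit. A hedged sketch of one way to bound the retries by threading the attempt count through the call (the attempt parameter is hypothetical, not part of the committed code):

    def create_enhanced_vector_store(self, chunks, is_contextualized_chunk, axiom_instance, attempt=0):
        try:
            ...  # build the Chroma store, BM25 index and chunk ids as above
        except Exception as e:
            if attempt + 1 >= 2:
                raise Exception(f"Error creating enhanced vector store: {str(e)}")
            self.logger.error(f"Error creating enhanced vector store: {str(e)}")
            # Retry once without contextualized chunks, carrying the attempt count forward.
            return self.create_enhanced_vector_store(chunks, False, axiom_instance, attempt + 1)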
_utils/ragas.py CHANGED
@@ -76,7 +76,7 @@ def test_ragas(serializer, listaPDFs):
76
 
77
  def generate_summary(vector_store, bm25, chunk_ids, query, summarizer):
78
  """Generates an enhanced summary using the vector store and BM25 index."""
79
- structured_summaries = summarizer.gerar_documento_final(
80
  vector_store, bm25, chunk_ids, query
81
  )
82
  return {"structured_summaries": structured_summaries}
 
76
 
77
  def generate_summary(vector_store, bm25, chunk_ids, query, summarizer):
78
  """Generates an enhanced summary using the vector store and BM25 index."""
79
+ structured_summaries = summarizer.do_last_requests(
80
  vector_store, bm25, chunk_ids, query
81
  )
82
  return {"structured_summaries": structured_summaries}
entrypoint.sh ADDED
@@ -0,0 +1,26 @@
1
+ #!/bin/sh
2
+ set -e # Exit immediately if a command exits with a non-zero status.
3
+
4
+ CREDENTIALS_FILE_PATH="/app/vella_gcp_luan_credentials.json"
5
+
6
+ # Check if the GCP_CREDENTIALS_JSON_CONTENT secret is provided
7
+ if [ -n "$GCP_CREDENTIALS_JSON_CONTENT" ]; then
8
+ echo "GCP_CREDENTIALS_JSON_CONTENT secret found. Writing to $CREDENTIALS_FILE_PATH"
9
+ # Use printf to preserve newlines and special characters correctly
10
+ printf "%s" "$GCP_CREDENTIALS_JSON_CONTENT" > "$CREDENTIALS_FILE_PATH"
11
+ export GOOGLE_APPLICATION_CREDENTIALS="$CREDENTIALS_FILE_PATH"
12
+ echo "GOOGLE_APPLICATION_CREDENTIALS set to $CREDENTIALS_FILE_PATH"
13
+
14
+ # Optional: Add a check to see if the file looks like JSON (basic check)
15
+ if command -v jq >/dev/null && jq -e . "$CREDENTIALS_FILE_PATH" >/dev/null 2>&1; then
16
+ echo "Credentials file appears to be valid JSON."
17
+ else
18
+ echo "Warning: Credentials file may not be valid JSON. Content:"
19
+ # cat "$CREDENTIALS_FILE_PATH" # Print the content for debugging
20
+ fi
21
+
22
+ else
23
+ echo "Warning: GCP_CREDENTIALS_JSON_CONTENT secret not found. GCP services might not authenticate."
24
+ fi
25
+
26
+ exec "$@"
gerar_documento/serializer.py CHANGED
@@ -73,7 +73,7 @@ class FileInfoSerializerData:
73
  class GerarDocumentoSerializer(
74
  GerarDocumentoInitialSerializer, GerarDocumentoParametros
75
  ):
76
- files = serializers.ListField(child=FileInfoSerializer(), required=True)
77
  bubble_editor_version = serializers.CharField(
78
  required=False, default="version-test"
79
  ) # Will be the value used in the URL of the request to Bubble
@@ -89,7 +89,7 @@ class GerarDocumentoSerializer(
89
  class GerarDocumentoSerializerData(
90
  GerarDocumentoParametrosData, GerarDocumentoInitialSerializerData
91
  ):
92
- files: List[FileInfoSerializerData]
93
  bubble_editor_version: str = "version-test"
94
 
95
  doc_id: str = ""
@@ -100,6 +100,8 @@ class GerarDocumentoSerializerData(
100
  class GerarDocumentoComPDFProprioSerializer(
101
  GerarDocumentoInitialSerializer, GerarDocumentoParametros
102
  ):
 
 
103
  def get_obj(self):
104
  return GerarDocumentoSerializerData(**self.validated_data) # type: ignore
105
 
@@ -108,17 +110,4 @@ class GerarDocumentoComPDFProprioSerializer(
108
  class GerarDocumentoComPDFProprioSerializerData(
109
  GerarDocumentoParametrosData, GerarDocumentoInitialSerializerData
110
  ):
111
- pass
112
-
113
-
114
- class GerarEmentaSerializer(serializers.Serializer):
115
- files = serializers.ListField(child=FileInfoSerializer(), required=True)
116
- user_message = serializers.CharField(required=False, default="")
117
- chunk_size = serializers.IntegerField(required=False, default=3500)
118
- chunk_overlap = serializers.IntegerField(required=False, default=800)
119
- bubble_editor_version = serializers.CharField(
120
- required=False, default="version-test"
121
- ) # Será o valor utilizado dentro da URL da requisição pro Bubble
122
- doc_id = serializers.CharField(required=True)
123
- form_response_id = serializers.CharField(required=True)
124
- version = serializers.CharField(required=True)
 
73
  class GerarDocumentoSerializer(
74
  GerarDocumentoInitialSerializer, GerarDocumentoParametros
75
  ):
76
+ files = serializers.CharField(required=True)
77
  bubble_editor_version = serializers.CharField(
78
  required=False, default="version-test"
79
  ) # Will be the value used in the URL of the request to Bubble
 
89
  class GerarDocumentoSerializerData(
90
  GerarDocumentoParametrosData, GerarDocumentoInitialSerializerData
91
  ):
92
+ files: str
93
  bubble_editor_version: str = "version-test"
94
 
95
  doc_id: str = ""
 
100
  class GerarDocumentoComPDFProprioSerializer(
101
  GerarDocumentoInitialSerializer, GerarDocumentoParametros
102
  ):
103
+ files = serializers.ListField(required=True)
104
+
105
  def get_obj(self):
106
  return GerarDocumentoSerializerData(**self.validated_data) # type: ignore
107
 
 
110
  class GerarDocumentoComPDFProprioSerializerData(
111
  GerarDocumentoParametrosData, GerarDocumentoInitialSerializerData
112
  ):
113
+ files: List[FileInfoSerializerData]
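With files now a plain CharField on GerarDocumentoSerializer, the request is expected to carry the file list as a JSON-encoded string, which the views decode with json.loads (see gerar_documento/views.py below). A minimal sketch of the expected shape; the URL is hypothetical:

    import json

    files_raw = '[{"link_arquivo": "https://example.com/doc1.pdf"}]'  # hypothetical payload value
    lista_pdfs = [f["link_arquivo"] for f in json.loads(files_raw)]
    # lista_pdfs == ["https://example.com/doc1.pdf"]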
gerar_documento/views.py CHANGED
@@ -1,13 +1,12 @@
1
  from typing import Any, Dict, cast
2
  from langchain.prompts import PromptTemplate
 
3
  from _utils.bubble_integrations.enviar_resposta_final import enviar_resposta_final
4
- from _utils.custom_exception_handler import custom_exception_handler_wihout_api_handler
5
  from _utils.gerar_documento_utils.GerarDocumento import GerarDocumento
6
  from _utils.langchain_utils.LLM_class import LLM
7
  from _utils.gerar_documento_utils.utils import (
8
  generate_document_title,
9
- gerar_resposta_compilada,
10
- get_full_text_and_all_PDFs_chunks,
11
  split_text_by_tokens,
12
  )
13
  from _utils.langchain_utils.Prompt_class import Prompt
@@ -29,10 +28,10 @@ from setup.logging import Axiom, send_axiom
29
  from .serializer import (
30
  GerarDocumentoComPDFProprioSerializer,
31
  GerarDocumentoSerializer,
32
- GerarEmentaSerializer,
33
  )
34
  import asyncio
35
  from _utils.langchain_utils.Splitter_class import Splitter
 
36
 
37
 
38
  class GerarDocumentoView(AsyncAPIView):
@@ -60,7 +59,8 @@ class GerarDocumentoView(AsyncAPIView):
60
  data = cast(Dict[str, Any], serializer.validated_data)
61
  self.serializer = data
62
 
63
- listaPDFs = [l["link_arquivo"] for l in data["files"]]
 
64
 
65
  self.axiom_instance.send_axiom(f"listaPDFs: {listaPDFs}")
66
 
@@ -138,17 +138,23 @@ class GerarEmentaView(AsyncAPIView):
138
 
139
  async def proccess_data_after_response():
140
  try:
 
 
141
  data = cast(Dict[str, Any], serializer.validated_data)
142
  self.serializer = data
143
 
144
- gerar_documento_instance = GerarDocumento(obj, self.axiom_instance)
 
 
145
 
146
- listaPDFs = [l["link_arquivo"] for l in data["files"]]
 
 
147
 
148
  self.axiom_instance.send_axiom(f"listaPDFs: {listaPDFs}")
149
 
150
  all_PDFs_chunks, full_text_as_array = (
151
- await get_full_text_and_all_PDFs_chunks(
152
  listaPDFs,
153
  Splitter(obj.chunk_size, obj.chunk_overlap),
154
  False,
@@ -208,7 +214,7 @@ class GerarEmentaView(AsyncAPIView):
208
  )
209
  except Exception as e:
210
  print(f"ERRO GERAR EMENTA: {e}")
211
- custom_exception_handler_wihout_api_handler(
212
  e, serializer, self.axiom_instance
213
  )
214
  raise
@@ -239,6 +245,8 @@ class GerarEmentaComPDFProprioView(AsyncAPIView):
239
  f"COMEÇOU NOVA REQUISIÇÃO - request.data: {request.data}"
240
  )
241
  serializer = GerarDocumentoComPDFProprioSerializer(data=request.data)
 
 
242
  if serializer.is_valid(raise_exception=True):
243
  data = cast(Dict[str, Any], serializer.validated_data)
244
  self.axiom_instance.send_axiom(f"data: {data}")
@@ -246,7 +254,7 @@ class GerarEmentaComPDFProprioView(AsyncAPIView):
246
  serializer_obj = serializer.get_obj()
247
 
248
  gerar_documento_instance = GerarDocumento(
249
- serializer_obj, self.axiom_instance
250
  )
251
 
252
  listaPDFs = handle_pdf_files_from_serializer(
@@ -255,7 +263,7 @@ class GerarEmentaComPDFProprioView(AsyncAPIView):
255
  self.axiom_instance.send_axiom(f"listaPDFs: {listaPDFs}")
256
 
257
  all_PDFs_chunks, full_text_as_array = (
258
- await get_full_text_and_all_PDFs_chunks(
259
  listaPDFs,
260
  Splitter(serializer_obj.chunk_size, serializer_obj.chunk_overlap),
261
  False,
 
1
  from typing import Any, Dict, cast
2
  from langchain.prompts import PromptTemplate
3
+ from _utils.Utils_Class import UtilsClass
4
  from _utils.bubble_integrations.enviar_resposta_final import enviar_resposta_final
5
+ from _utils.custom_exception_handler import custom_exception_handler_without_api_handler
6
  from _utils.gerar_documento_utils.GerarDocumento import GerarDocumento
7
  from _utils.langchain_utils.LLM_class import LLM
8
  from _utils.gerar_documento_utils.utils import (
9
  generate_document_title,
 
 
10
  split_text_by_tokens,
11
  )
12
  from _utils.langchain_utils.Prompt_class import Prompt
 
28
  from .serializer import (
29
  GerarDocumentoComPDFProprioSerializer,
30
  GerarDocumentoSerializer,
 
31
  )
32
  import asyncio
33
  from _utils.langchain_utils.Splitter_class import Splitter
34
+ import json
35
 
36
 
37
  class GerarDocumentoView(AsyncAPIView):
 
59
  data = cast(Dict[str, Any], serializer.validated_data)
60
  self.serializer = data
61
 
62
+ listaPDFs = json.loads(obj.files)
63
+ listaPDFs = [l["link_arquivo"] for l in listaPDFs]
64
 
65
  self.axiom_instance.send_axiom(f"listaPDFs: {listaPDFs}")
66
 
 
138
 
139
  async def proccess_data_after_response():
140
  try:
141
+ util = UtilsClass()
142
+ handle_files = util.handle_files
143
  data = cast(Dict[str, Any], serializer.validated_data)
144
  self.serializer = data
145
 
146
+ gerar_documento_instance = GerarDocumento(
147
+ obj, True, self.axiom_instance
148
+ )
149
 
150
+ # listaPDFs = [l["link_arquivo"] for l in data["files"]]
151
+ listaPDFs = json.loads(obj.files)
152
+ listaPDFs = [l["link_arquivo"] for l in listaPDFs]
153
 
154
  self.axiom_instance.send_axiom(f"listaPDFs: {listaPDFs}")
155
 
156
  all_PDFs_chunks, full_text_as_array = (
157
+ await handle_files.get_full_text_and_all_PDFs_chunks(
158
  listaPDFs,
159
  Splitter(obj.chunk_size, obj.chunk_overlap),
160
  False,
 
214
  )
215
  except Exception as e:
216
  print(f"ERRO GERAR EMENTA: {e}")
217
+ custom_exception_handler_without_api_handler(
218
  e, serializer, self.axiom_instance
219
  )
220
  raise
 
245
  f"COMEÇOU NOVA REQUISIÇÃO - request.data: {request.data}"
246
  )
247
  serializer = GerarDocumentoComPDFProprioSerializer(data=request.data)
248
+ util = UtilsClass()
249
+ handle_files = util.handle_files
250
  if serializer.is_valid(raise_exception=True):
251
  data = cast(Dict[str, Any], serializer.validated_data)
252
  self.axiom_instance.send_axiom(f"data: {data}")
 
254
  serializer_obj = serializer.get_obj()
255
 
256
  gerar_documento_instance = GerarDocumento(
257
+ serializer_obj, False, self.axiom_instance
258
  )
259
 
260
  listaPDFs = handle_pdf_files_from_serializer(
 
263
  self.axiom_instance.send_axiom(f"listaPDFs: {listaPDFs}")
264
 
265
  all_PDFs_chunks, full_text_as_array = (
266
+ await handle_files.get_full_text_and_all_PDFs_chunks(
267
  listaPDFs,
268
  Splitter(serializer_obj.chunk_size, serializer_obj.chunk_overlap),
269
  False,
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
setup/installed_apps.py CHANGED
@@ -16,4 +16,5 @@ INSTALLED_APPS = config_apps + [
16
  "modelos_usuarios",
17
  "ragas_api",
18
  "gerar_documento",
 
19
  ]
 
16
  "modelos_usuarios",
17
  "ragas_api",
18
  "gerar_documento",
19
+ "simple_llm",
20
  ]
setup/urls.py CHANGED
@@ -19,4 +19,5 @@ urlpatterns = config_urls + [
19
  path("", include("gerar_documento.urls")),
20
  path("", include("ragas_api.urls")),
21
  path("", include("modelos_usuarios.urls")),
 
22
  ]
 
19
  path("", include("gerar_documento.urls")),
20
  path("", include("ragas_api.urls")),
21
  path("", include("modelos_usuarios.urls")),
22
+ path("", include("simple_llm.urls")),
23
  ]
simple_llm/__init__.py ADDED
File without changes
simple_llm/admin.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from django.contrib import admin
2
+
3
+ # Register your models here.
simple_llm/apps.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from django.apps import AppConfig
2
+
3
+
4
+ class SimpleLlmConfig(AppConfig):
5
+ default_auto_field = 'django.db.models.BigAutoField'
6
+ name = 'simple_llm'
simple_llm/migrations/__init__.py ADDED
File without changes
simple_llm/models.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from django.db import models
2
+
3
+ # Create your models here.
simple_llm/serializer.py ADDED
@@ -0,0 +1,57 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import List, Optional, Union
3
+ from rest_framework import serializers
4
+ from _utils.gerar_documento_utils.prompts import (
5
+ prompt_gerar_documento,
6
+ )
7
+ from gerar_documento.serializer import FileInfoSerializer, FileInfoSerializerData
8
+ from gerar_documento.serializer_base import (
9
+ GerarDocumentoParametros,
10
+ GerarDocumentoParametrosData,
11
+ )
12
+ from setup.environment import default_model
13
+ from django.core.files.uploadedfile import UploadedFile
14
+
15
+ user_message = "What are the main points of this document?"
16
+
17
+
18
+ class SimpleLLMInitialSerializer(serializers.Serializer):
19
+ files = serializers.ListField(child=serializers.FileField(), required=False)
20
+ user_text = serializers.CharField(required=False, default=user_message)
21
+ model = serializers.CharField(required=False, default=default_model)
22
+ prompt = serializers.CharField(required=False, default=prompt_gerar_documento)
23
+ llm_ultimas_requests = serializers.CharField(
24
+ required=False, default="gemini-2.0-flash"
25
+ )
26
+
27
+
28
+ @dataclass
29
+ class SimpleLLMInitialSerializerData:
30
+ files: List[dict] = field(default_factory=list)
31
+ user_text: str = ""
32
+ model: str = default_model
33
+ prompt: str = ""
34
+ llm_ultimas_requests: str = "gemini-2.0-flash"
35
+
36
+
37
+ class SimpleLLMSerializer(SimpleLLMInitialSerializer):
38
+ files = serializers.ListField(child=FileInfoSerializer(), required=False)
39
+ bubble_editor_version = serializers.CharField(
40
+ required=False, default="version-test"
41
+ ) # Will be the value used in the URL of the request to Bubble
42
+ doc_id = serializers.CharField(required=True)
43
+ form_response_id = serializers.CharField(required=True)
44
+ version = serializers.CharField(required=True)
45
+
46
+ def get_obj(self):
47
+ return SimpleSerializerData(**self.validated_data) # type: ignore
48
+
49
+
50
+ @dataclass
51
+ class SimpleSerializerData(SimpleLLMInitialSerializerData):
52
+ files: List[FileInfoSerializerData] = field(default_factory=list)
53
+ bubble_editor_version: str = "version-test"
54
+
55
+ doc_id: str = ""
56
+ form_response_id: str = ""
57
+ version: str = ""
simple_llm/tests.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from django.test import TestCase
2
+
3
+ # Create your tests here.
simple_llm/urls.py ADDED
@@ -0,0 +1,11 @@
1
+ from django.urls import path
2
+
3
+ from simple_llm.views import SimpleLLMView
4
+
5
+ urlpatterns = [
6
+ path(
7
+ "llm",
8
+ SimpleLLMView.as_view(),
9
+ name="simple-llm",
10
+ ),
11
+ ]
simple_llm/views.py ADDED
@@ -0,0 +1,73 @@
1
+ from datetime import datetime
2
+ from _utils.custom_exception_handler import custom_exception_handler_without_api_handler
3
+ from _utils.gerar_documento import gerar_documento
4
+ from _utils.gerar_documento_utils.GerarDocumento import (
5
+ GerarDocumento,
6
+ GerarDocumentoUtils,
7
+ )
8
+ from _utils.langchain_utils.Prompt_class import Prompt
9
+ from _utils.utils import convert_markdown_to_HTML
10
+ from setup.logging import Axiom
11
+ from setup.easy_imports import (
12
+ Response,
13
+ AsyncAPIView,
14
+ extend_schema,
15
+ )
16
+ from simple_llm.serializer import SimpleLLMSerializer
17
+
18
+
19
+ class SimpleLLMView(AsyncAPIView):
20
+ # parser_classes = [MultiPartParser]
21
+ serializer = {}
22
+ axiom_instance = Axiom()
23
+
24
+ @extend_schema(
25
+ request=SimpleLLMSerializer,
26
+ )
27
+ async def post(self, request):
28
+ try:
29
+ self.axiom_instance.generate_new_uuid()
30
+ print(f"\n\nDATA E HORA DA REQUISIÇÃO: {datetime.now()}")
31
+ self.axiom_instance.send_axiom(
32
+ f"COMEÇOU NOVA REQUISIÇÃO - request.data: {request.data}"
33
+ )
34
+ serializer = SimpleLLMSerializer(data=request.data)
35
+ if serializer.is_valid(raise_exception=True):
36
+ obj = serializer.get_obj() # type: ignore
37
+ if not serializer.validated_data:
38
+ raise ValueError("Erro no validated_data")
39
+
40
+ self.serializer = obj
41
+
42
+ listaPDFs = [l.link_arquivo for l in obj.files]
43
+ self.axiom_instance.send_axiom(f"listaPDFs: {listaPDFs}")
44
+
45
+ summarizer = GerarDocumentoUtils(self.axiom_instance)
46
+
47
+ prompt_instance = Prompt()
48
+ prompt = prompt_instance.create_and_invoke_prompt(
49
+ obj.prompt,
50
+ dynamic_dict={"context": obj.user_text},
51
+ )
52
+
53
+ resposta_llm = (
54
+ await summarizer.checar_se_resposta_vazia_do_documento_final(
55
+ obj.llm_ultimas_requests, prompt.to_string()
56
+ )
57
+ )
58
+ self.axiom_instance.send_axiom(f"resposta_llm: {resposta_llm}")
59
+
60
+ texto_completo_como_html = convert_markdown_to_HTML(
61
+ resposta_llm
62
+ ).replace("resposta_segunda_etapa:", "<br><br>")
63
+
64
+ self.axiom_instance.send_axiom(
65
+ f"texto_completo_como_html: {texto_completo_como_html}"
66
+ )
67
+
68
+ return Response({"resposta": texto_completo_como_html})
69
+ except Exception as e:
70
+ custom_exception_handler_without_api_handler(
71
+ e, serializer, self.axiom_instance
72
+ )
73
+ raise
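A hedged sketch of exercising the new endpoint, which setup/urls.py mounts at the root as "llm". Host, port and field values are assumptions; files is optional in SimpleLLMSerializer, so it is omitted here.

    import requests

    payload = {
        "doc_id": "123",            # required by SimpleLLMSerializer
        "form_response_id": "abc",  # required
        "version": "live",          # required
        "user_text": "Quais os pontos principais deste documento?",
    }
    resp = requests.post("http://localhost:8000/llm", json=payload)
    print(resp.json()["resposta"])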