luanpoppe committed
Commit aae4d3d · 1 Parent(s): 5fde427

feat: improving the gerar_documento refactoring
_utils/gerar_documento.py
CHANGED
@@ -1,30 +1,11 @@
 import os
-from
-from typing import Any, Union, cast
-from _utils.Utils_Class import UtilsClass
-from _utils.axiom_logs import AxiomLogs
-from _utils.langchain_utils.LLM_class import LLM
-from _utils.bubble_integrations.enviar_resposta_final import enviar_resposta_final
+from typing import Any, Union
 from _utils.custom_exception_handler import custom_exception_handler_without_api_handler
 from rest_framework.response import Response
 from _utils.gerar_documento_utils.GerarDocumento import (
     GerarDocumento,
 )
-from _utils.gerar_documento_utils.contextual_retriever import (
-    ContextualRetriever,
-)
-from _utils.gerar_documento_utils.utils import (
-    generate_document_title,
-    gerar_resposta_compilada,
-    get_response_from_auxiliar_contextual_prompt,
-)
-from _utils.models.gerar_documento import (
-    RetrievalConfig,
-)
-import markdown
 
-from _utils.langchain_utils.Prompt_class import Prompt
-from _utils.utils import convert_markdown_to_HTML
 from gerar_documento.serializer import (
     GerarDocumentoComPDFProprioSerializer,
     GerarDocumentoComPDFProprioSerializerData,
@@ -48,20 +29,12 @@ async def gerar_documento(
     isBubble=False,
 ):
     try:
-        axiom = axiom_instance.send_axiom
-        ax = AxiomLogs(axiom_instance)
-        utils = UtilsClass()
         summarizer = GerarDocumento(serializer, isBubble, axiom_instance)
+        summarizer.lista_pdfs = listaPDFs
 
-
-
-        is_contextualized_chunk = serializer.should_have_contextual_chunks
+        await summarizer.get_text_and_pdf_chunks()
 
-
-            full_text_as_array
-        )
-        summarizer.resumo_auxiliar = response_auxiliar_summary
-        ax.resumo_inicial_processo(response_auxiliar_summary)
+        await summarizer.get_response_from_auxiliar_contextual_prompt()
 
         await summarizer.generate_chunks_processados()
 
@@ -85,7 +58,7 @@ async def gerar_documento(
             "texto_completo": summarizer.texto_completo_como_html,
             "titulo_do_documento": summarizer.titulo_do_documento,
             "resultado": structured_summaries,
-            "parametros-utilizados": gerar_resposta_compilada(
+            "parametros-utilizados": summarizer.gerar_resposta_compilada(),
         }
     except Exception as e:
         custom_exception_handler_without_api_handler(e, serializer, axiom_instance)
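Taken together, the gerar_documento.py changes move orchestration into GerarDocumento: the entry-point function no longer builds AxiomLogs/UtilsClass helpers or calls module-level utilities, it only sets inputs on the summarizer and awaits its methods. The sketch below illustrates that shape only; FakeSummarizer, its method bodies, and the serializer handling are hypothetical stand-ins, not the repo's real classes.

import asyncio
from typing import Any, List


class FakeSummarizer:
    """Hypothetical stand-in for GerarDocumento: each pipeline step is a method that stores state on self."""

    def __init__(self, serializer: Any):
        self.serializer = serializer
        self.lista_pdfs: List[str] = []
        self.chunks: List[str] = []
        self.resumo_auxiliar = ""

    async def get_text_and_pdf_chunks(self):
        # The real method would load and split the PDFs listed in self.lista_pdfs.
        self.chunks = [f"chunk from {pdf}" for pdf in self.lista_pdfs]

    async def get_response_from_auxiliar_contextual_prompt(self):
        # The real method batches chunks by token count and calls an LLM per batch.
        self.resumo_auxiliar = " / ".join(self.chunks)

    def gerar_resposta_compilada(self):
        # Now an instance method: it reads the serializer stored on self instead of taking it as an argument.
        return {"model": getattr(self.serializer, "model", None)}


async def gerar_documento(serializer: Any, lista_pdfs: List[str]) -> dict:
    summarizer = FakeSummarizer(serializer)
    summarizer.lista_pdfs = lista_pdfs

    await summarizer.get_text_and_pdf_chunks()
    await summarizer.get_response_from_auxiliar_contextual_prompt()

    return {"parametros-utilizados": summarizer.gerar_resposta_compilada()}


if __name__ == "__main__":
    print(asyncio.run(gerar_documento(object(), ["a.pdf", "b.pdf"])))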
_utils/gerar_documento_utils/GerarDocumento.py
CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-import
+from langchain_core.messages import HumanMessage
 from typing import Any, List, Dict, Literal, Tuple, Optional, Union, cast
 
 from pydantic import SecretStr
@@ -9,6 +9,7 @@ from _utils.bubble_integrations.enviar_resposta_final import enviar_resposta_fin
 from _utils.gerar_documento_utils.contextual_retriever import ContextualRetriever
 from _utils.gerar_documento_utils.llm_calls import agemini_answer
 from _utils.gerar_documento_utils.prompts import (
+    create_prompt_auxiliar_do_contextual_prompt,
     prompt_gerar_query_dinamicamente,
     prompt_para_gerar_titulo,
 )
@@ -40,6 +41,7 @@ from _utils.langchain_utils.Splitter_class import Splitter
 import time
 from setup.tokens import openai_api_key, cohere_api_key
 from setup.logging import Axiom
+import tiktoken
 
 
 def reciprocal_rank_fusion(result_lists, weights=None):
@@ -124,6 +126,10 @@ class GerarDocumento:
     structured_output: List[Any]
     texto_completo_como_html: str
     titulo_do_documento: str
+    encoding_tiktoken = tiktoken.get_encoding("cl100k_base")
+    serializer: Union[
+        GerarDocumentoSerializerData, GerarDocumentoComPDFProprioSerializerData, Any
+    ]
 
     def __init__(
         self,
@@ -133,6 +139,7 @@
         isBubble: bool,
         axiom_instance: Axiom,
     ):
+        self.serializer = serializer
         self.config = self.gerar_documento_utils.create_retrieval_config(serializer)
         self.logger = logging.getLogger(__name__)
         # self.prompt_auxiliar = prompt_auxiliar
@@ -188,8 +195,10 @@
             else self.all_PDFs_chunks
         )
         self.chunks_processados = chunks_processados
-        self.
-
+        if len(self.chunks_processados) == 0:
+            self.chunks_processados = self.all_PDFs_chunks
+        self.ax.chunks_inicialmente(self.chunks_processados)
+        return self.chunks_processados
 
     async def generate_query_for_vector_store(self):
         prompt_para_gerar_query_dinamico = prompt_gerar_query_dinamicamente(
@@ -542,3 +551,77 @@
         self.axiom_instance.send_axiom(f"RESULTADO ETAPA 3: {documento_gerado}")
 
         return texto_final_juntando_as_etapas
+
+    # Esta função gera a resposta que será usada em cada um das requisições de cada chunk
+    async def get_response_from_auxiliar_contextual_prompt(self):
+        llms = LLM()
+        responses = []
+
+        current_chunk = []
+        current_token_count = 0
+        chunk_counter = 1
+
+        for part in self.full_text_as_array:
+            part_tokens = len(self.encoding_tiktoken.encode(part))
+
+            # Check if adding this part would EXCEED the limit
+            if current_token_count + part_tokens > 600000:
+                # Process the accumulated chunk before it exceeds the limit
+                chunk_text = "".join(current_chunk)
+                print(
+                    f"\nProcessing chunk {chunk_counter} with {current_token_count} tokens"
+                )
+
+                prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
+                response = await llms.google_gemini().ainvoke(
+                    [HumanMessage(content=prompt)]
+                )
+                responses.append(response.content)
+
+                # Start new chunk with current part
+                current_chunk = [part]
+                current_token_count = part_tokens
+                chunk_counter += 1
+            else:
+                # Safe to add to current chunk
+                current_chunk.append(part)
+                current_token_count += part_tokens
+
+        # Process the final remaining chunk
+        if current_chunk:
+            chunk_text = "".join(current_chunk)
+            print(
+                f"\nProcessing final chunk {chunk_counter} with {current_token_count} tokens"
+            )
+            prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
+            response = await llms.google_gemini().ainvoke(
+                [HumanMessage(content=prompt)]
+            )
+            responses.append(response.content)
+
+        self.resumo_auxiliar = "".join(responses)
+        self.ax.resumo_inicial_processo(self.resumo_auxiliar)
+
+        return self.resumo_auxiliar
+
+    def gerar_resposta_compilada(self):
+        serializer = self.serializer
+        return {
+            "num_chunks_retrieval": serializer.num_chunks_retrieval,
+            "embedding_weight": serializer.embedding_weight,
+            "bm25_weight": serializer.bm25_weight,
+            "context_window": serializer.context_window,
+            "chunk_overlap": serializer.chunk_overlap,
+            "num_k_rerank": serializer.num_k_rerank,
+            "model_cohere_rerank": serializer.model_cohere_rerank,
+            "more_initial_chunks_for_reranking": serializer.more_initial_chunks_for_reranking,
+            "claude_context_model": serializer.claude_context_model,
+            "gpt_temperature": serializer.gpt_temperature,
+            "user_message": serializer.user_message,
+            "model": serializer.model,
+            "hf_embedding": serializer.hf_embedding,
+            "chunk_size": serializer.chunk_size,
+            "chunk_overlap": serializer.chunk_overlap,
+            # "prompt_auxiliar": serializer.prompt_auxiliar,
+            "prompt_gerar_documento": serializer.prompt_gerar_documento[0:200],
+        }
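The new get_response_from_auxiliar_contextual_prompt accumulates parts of full_text_as_array until the next part would push a batch past 600,000 cl100k_base tokens, sends one Gemini call per batch, and joins the responses into resumo_auxiliar. Below is a minimal, standalone sketch of just that batching step; batch_by_token_count and TOKEN_LIMIT are names invented for the example, the LLM call is omitted, and a small guard is added so a first oversized part does not flush an empty batch (the loop above would).

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")  # same encoding the class stores as encoding_tiktoken
TOKEN_LIMIT = 600_000  # per-batch budget used in the diff


def batch_by_token_count(parts, limit=TOKEN_LIMIT):
    """Group consecutive text parts so each joined batch stays within `limit` tokens."""
    batches = []
    current = []
    current_tokens = 0

    for part in parts:
        part_tokens = len(encoding.encode(part))
        if current and current_tokens + part_tokens > limit:
            batches.append("".join(current))  # flush before exceeding the budget
            current, current_tokens = [part], part_tokens
        else:
            current.append(part)
            current_tokens += part_tokens

    if current:
        batches.append("".join(current))  # flush whatever is left
    return batches


# Each batch would then be wrapped in the auxiliary prompt and sent to the model,
# and the responses concatenated into resumo_auxiliar.
print([len(encoding.encode(b)) for b in batch_by_token_count(["alpha " * 5, "beta " * 5], limit=20)])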
_utils/gerar_documento_utils/utils.py
CHANGED
@@ -1,92 +1,10 @@
-from typing import Any, List, Tuple, Union
-from langchain_core.documents import Document
-from langchain_core.messages import HumanMessage
-
 from _utils.gerar_documento_utils.llm_calls import agemini_answer
-from _utils.
-from _utils.langchain_utils.LLM_class import LLM
-from _utils.gerar_documento_utils.prompts import (
-    create_prompt_auxiliar_do_contextual_prompt,
-    prompt_para_gerar_titulo,
-)
-
-from _utils.models.gerar_documento import DocumentChunk
-from gerar_documento.serializer import GerarDocumentoSerializerData
+from _utils.gerar_documento_utils.prompts import prompt_para_gerar_titulo
 import tiktoken
 
 encoding = tiktoken.get_encoding("cl100k_base")
 
 
-def gerar_resposta_compilada(serializer: Union[GerarDocumentoSerializerData, Any]):
-    return {
-        "num_chunks_retrieval": serializer.num_chunks_retrieval,
-        "embedding_weight": serializer.embedding_weight,
-        "bm25_weight": serializer.bm25_weight,
-        "context_window": serializer.context_window,
-        "chunk_overlap": serializer.chunk_overlap,
-        "num_k_rerank": serializer.num_k_rerank,
-        "model_cohere_rerank": serializer.model_cohere_rerank,
-        "more_initial_chunks_for_reranking": serializer.more_initial_chunks_for_reranking,
-        "claude_context_model": serializer.claude_context_model,
-        "gpt_temperature": serializer.gpt_temperature,
-        "user_message": serializer.user_message,
-        "model": serializer.model,
-        "hf_embedding": serializer.hf_embedding,
-        "chunk_size": serializer.chunk_size,
-        "chunk_overlap": serializer.chunk_overlap,
-        # "prompt_auxiliar": serializer.prompt_auxiliar,
-        "prompt_gerar_documento": serializer.prompt_gerar_documento[0:200],
-    }
-
-
-# Esta função gera a resposta que será usada em cada um das requisições de cada chunk
-async def get_response_from_auxiliar_contextual_prompt(full_text_as_array: List[str]):
-    llms = LLM()
-    responses = []
-
-    current_chunk = []
-    current_token_count = 0
-    chunk_counter = 1
-
-    for part in full_text_as_array:
-        part_tokens = len(encoding.encode(part))
-
-        # Check if adding this part would EXCEED the limit
-        if current_token_count + part_tokens > 600000:
-            # Process the accumulated chunk before it exceeds the limit
-            chunk_text = "".join(current_chunk)
-            print(
-                f"\nProcessing chunk {chunk_counter} with {current_token_count} tokens"
-            )
-
-            prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
-            response = await llms.google_gemini().ainvoke(
-                [HumanMessage(content=prompt)]
-            )
-            responses.append(response.content)
-
-            # Start new chunk with current part
-            current_chunk = [part]
-            current_token_count = part_tokens
-            chunk_counter += 1
-        else:
-            # Safe to add to current chunk
-            current_chunk.append(part)
-            current_token_count += part_tokens
-
-    # Process the final remaining chunk
-    if current_chunk:
-        chunk_text = "".join(current_chunk)
-        print(
-            f"\nProcessing final chunk {chunk_counter} with {current_token_count} tokens"
-        )
-        prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
-        response = await llms.google_gemini().ainvoke([HumanMessage(content=prompt)])
-        responses.append(response.content)
-
-    return "".join(responses)
-
-
 def split_text_by_tokens(full_text: str):
     tokens = encoding.encode(full_text)
     max_tokens = 600000
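After this change utils.py keeps only split_text_by_tokens (plus the title-prompt import); the diff shows its first lines but not the full body. A hedged guess at what a splitter with that shape typically does is sketched below: encode the text, slice the token list into max_tokens windows, and decode each window back to text. The body and the extra max_tokens parameter are assumptions for illustration, not the repo's actual implementation.

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")


def split_text_by_tokens(full_text: str, max_tokens: int = 600_000):
    """Split `full_text` into pieces of at most `max_tokens` cl100k_base tokens each (assumed behaviour)."""
    tokens = encoding.encode(full_text)
    return [
        encoding.decode(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]


# Example: a short text and a tiny budget just to show the shape of the output.
print(split_text_by_tokens("one two three four five six", max_tokens=3))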
gerar_documento/views.py
CHANGED
@@ -7,7 +7,6 @@ from _utils.gerar_documento_utils.GerarDocumento import GerarDocumento
 from _utils.langchain_utils.LLM_class import LLM
 from _utils.gerar_documento_utils.utils import (
     generate_document_title,
-    gerar_resposta_compilada,
     split_text_by_tokens,
 )
 from _utils.langchain_utils.Prompt_class import Prompt