luanpoppe committed
Commit aae4d3d · 1 Parent(s): 5fde427

feat: improve the gerar_documento refactoring

_utils/gerar_documento.py CHANGED
@@ -1,30 +1,11 @@
 import os
-from langchain_core.messages import HumanMessage
-from typing import Any, Union, cast
-from _utils.Utils_Class import UtilsClass
-from _utils.axiom_logs import AxiomLogs
-from _utils.langchain_utils.LLM_class import LLM
-from _utils.bubble_integrations.enviar_resposta_final import enviar_resposta_final
+from typing import Any, Union
 from _utils.custom_exception_handler import custom_exception_handler_without_api_handler
 from rest_framework.response import Response
 from _utils.gerar_documento_utils.GerarDocumento import (
     GerarDocumento,
 )
-from _utils.gerar_documento_utils.contextual_retriever import (
-    ContextualRetriever,
-)
-from _utils.gerar_documento_utils.utils import (
-    generate_document_title,
-    gerar_resposta_compilada,
-    get_response_from_auxiliar_contextual_prompt,
-)
-from _utils.models.gerar_documento import (
-    RetrievalConfig,
-)
-import markdown
 
-from _utils.langchain_utils.Prompt_class import Prompt
-from _utils.utils import convert_markdown_to_HTML
 from gerar_documento.serializer import (
     GerarDocumentoComPDFProprioSerializer,
     GerarDocumentoComPDFProprioSerializerData,
@@ -48,20 +29,12 @@ async def gerar_documento(
     isBubble=False,
 ):
     try:
-        axiom = axiom_instance.send_axiom
-        ax = AxiomLogs(axiom_instance)
-        utils = UtilsClass()
         summarizer = GerarDocumento(serializer, isBubble, axiom_instance)
+        summarizer.lista_pdfs = listaPDFs
 
-        all_PDFs_chunks, full_text_as_array = await summarizer.get_text_and_pdf_chunks()
-
-        is_contextualized_chunk = serializer.should_have_contextual_chunks
+        await summarizer.get_text_and_pdf_chunks()
 
-        response_auxiliar_summary = await get_response_from_auxiliar_contextual_prompt(
-            full_text_as_array
-        )
-        summarizer.resumo_auxiliar = response_auxiliar_summary
-        ax.resumo_inicial_processo(response_auxiliar_summary)
+        await summarizer.get_response_from_auxiliar_contextual_prompt()
 
         await summarizer.generate_chunks_processados()
 
@@ -85,7 +58,7 @@ async def gerar_documento(
             "texto_completo": summarizer.texto_completo_como_html,
             "titulo_do_documento": summarizer.titulo_do_documento,
             "resultado": structured_summaries,
-            "parametros-utilizados": gerar_resposta_compilada(serializer),
+            "parametros-utilizados": summarizer.gerar_resposta_compilada(),
         }
     except Exception as e:
         custom_exception_handler_without_api_handler(e, serializer, axiom_instance)
_utils/gerar_documento_utils/GerarDocumento.py CHANGED
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-import os
+from langchain_core.messages import HumanMessage
 from typing import Any, List, Dict, Literal, Tuple, Optional, Union, cast
 
 from pydantic import SecretStr
@@ -9,6 +9,7 @@ from _utils.bubble_integrations.enviar_resposta_final import enviar_resposta_final
 from _utils.gerar_documento_utils.contextual_retriever import ContextualRetriever
 from _utils.gerar_documento_utils.llm_calls import agemini_answer
 from _utils.gerar_documento_utils.prompts import (
+    create_prompt_auxiliar_do_contextual_prompt,
     prompt_gerar_query_dinamicamente,
     prompt_para_gerar_titulo,
 )
@@ -40,6 +41,7 @@ from _utils.langchain_utils.Splitter_class import Splitter
 import time
 from setup.tokens import openai_api_key, cohere_api_key
 from setup.logging import Axiom
+import tiktoken
 
 
 def reciprocal_rank_fusion(result_lists, weights=None):
@@ -124,6 +126,10 @@ class GerarDocumento:
     structured_output: List[Any]
     texto_completo_como_html: str
     titulo_do_documento: str
+    encoding_tiktoken = tiktoken.get_encoding("cl100k_base")
+    serializer: Union[
+        GerarDocumentoSerializerData, GerarDocumentoComPDFProprioSerializerData, Any
+    ]
 
     def __init__(
         self,
@@ -133,6 +139,7 @@ class GerarDocumento:
         isBubble: bool,
         axiom_instance: Axiom,
     ):
+        self.serializer = serializer
        self.config = self.gerar_documento_utils.create_retrieval_config(serializer)
        self.logger = logging.getLogger(__name__)
        # self.prompt_auxiliar = prompt_auxiliar
@@ -188,8 +195,10 @@ class GerarDocumento:
             else self.all_PDFs_chunks
         )
         self.chunks_processados = chunks_processados
-        self.ax.chunks_inicialmente(chunks_processados)
-        return chunks_processados
+        if len(self.chunks_processados) == 0:
+            self.chunks_processados = self.all_PDFs_chunks
+        self.ax.chunks_inicialmente(self.chunks_processados)
+        return self.chunks_processados
 
     async def generate_query_for_vector_store(self):
         prompt_para_gerar_query_dinamico = prompt_gerar_query_dinamicamente(
@@ -542,3 +551,77 @@ class GerarDocumento:
         self.axiom_instance.send_axiom(f"RESULTADO ETAPA 3: {documento_gerado}")
 
         return texto_final_juntando_as_etapas
+
+    # This function generates the response that will be used in each chunk's request
+    async def get_response_from_auxiliar_contextual_prompt(self):
+        llms = LLM()
+        responses = []
+
+        current_chunk = []
+        current_token_count = 0
+        chunk_counter = 1
+
+        for part in self.full_text_as_array:
+            part_tokens = len(self.encoding_tiktoken.encode(part))
+
+            # Check if adding this part would EXCEED the limit
+            if current_token_count + part_tokens > 600000:
+                # Process the accumulated chunk before it exceeds the limit
+                chunk_text = "".join(current_chunk)
+                print(
+                    f"\nProcessing chunk {chunk_counter} with {current_token_count} tokens"
+                )
+
+                prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
+                response = await llms.google_gemini().ainvoke(
+                    [HumanMessage(content=prompt)]
+                )
+                responses.append(response.content)
+
+                # Start new chunk with current part
+                current_chunk = [part]
+                current_token_count = part_tokens
+                chunk_counter += 1
+            else:
+                # Safe to add to current chunk
+                current_chunk.append(part)
+                current_token_count += part_tokens
+
+        # Process the final remaining chunk
+        if current_chunk:
+            chunk_text = "".join(current_chunk)
+            print(
+                f"\nProcessing final chunk {chunk_counter} with {current_token_count} tokens"
+            )
+            prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
+            response = await llms.google_gemini().ainvoke(
+                [HumanMessage(content=prompt)]
+            )
+            responses.append(response.content)
+
+        self.resumo_auxiliar = "".join(responses)
+        self.ax.resumo_inicial_processo(self.resumo_auxiliar)
+
+        return self.resumo_auxiliar
+
+    def gerar_resposta_compilada(self):
+        serializer = self.serializer
+        return {
+            "num_chunks_retrieval": serializer.num_chunks_retrieval,
+            "embedding_weight": serializer.embedding_weight,
+            "bm25_weight": serializer.bm25_weight,
+            "context_window": serializer.context_window,
+            "chunk_overlap": serializer.chunk_overlap,
+            "num_k_rerank": serializer.num_k_rerank,
+            "model_cohere_rerank": serializer.model_cohere_rerank,
+            "more_initial_chunks_for_reranking": serializer.more_initial_chunks_for_reranking,
+            "claude_context_model": serializer.claude_context_model,
+            "gpt_temperature": serializer.gpt_temperature,
+            "user_message": serializer.user_message,
+            "model": serializer.model,
+            "hf_embedding": serializer.hf_embedding,
+            "chunk_size": serializer.chunk_size,
+            "chunk_overlap": serializer.chunk_overlap,
+            # "prompt_auxiliar": serializer.prompt_auxiliar,
+            "prompt_gerar_documento": serializer.prompt_gerar_documento[0:200],
+        }
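The new get_response_from_auxiliar_contextual_prompt method accumulates parts of the full document text until the next part would push the running count past 600,000 tokens, flushes the accumulated batch to Gemini, and starts a new batch with the part that did not fit. The following is a minimal, self-contained sketch of just that accumulate-and-flush batching, with the LLM call left out; batch_parts_by_token_limit is a hypothetical helper name for illustration, not part of this commit.

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

def batch_parts_by_token_limit(parts: list[str], limit: int = 600_000) -> list[str]:
    """Group consecutive text parts into batches whose token count stays under `limit`.

    Mirrors the accumulate-and-flush loop in get_response_from_auxiliar_contextual_prompt,
    minus the prompt construction and the Gemini call.
    """
    batches: list[str] = []
    current: list[str] = []
    current_tokens = 0

    for part in parts:
        part_tokens = len(encoding.encode(part))
        # Flush the accumulated batch before it would exceed the limit
        # (skipping the flush when nothing has been accumulated yet).
        if current_tokens + part_tokens > limit and current:
            batches.append("".join(current))
            current = [part]
            current_tokens = part_tokens
        else:
            current.append(part)
            current_tokens += part_tokens

    # Keep whatever remains as the final batch
    if current:
        batches.append("".join(current))

    return batches

In the committed method, each flushed batch is wrapped by create_prompt_auxiliar_do_contextual_prompt, sent via llms.google_gemini().ainvoke([...]), and the concatenated responses become self.resumo_auxiliar.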
_utils/gerar_documento_utils/utils.py CHANGED
@@ -1,92 +1,10 @@
-from typing import Any, List, Tuple, Union
-from langchain_core.documents import Document
-from langchain_core.messages import HumanMessage
-
 from _utils.gerar_documento_utils.llm_calls import agemini_answer
-from _utils.langchain_utils.Splitter_class import Splitter
-from _utils.langchain_utils.LLM_class import LLM
-from _utils.gerar_documento_utils.prompts import (
-    create_prompt_auxiliar_do_contextual_prompt,
-    prompt_para_gerar_titulo,
-)
-
-from _utils.models.gerar_documento import DocumentChunk
-from gerar_documento.serializer import GerarDocumentoSerializerData
+from _utils.gerar_documento_utils.prompts import prompt_para_gerar_titulo
 import tiktoken
 
 encoding = tiktoken.get_encoding("cl100k_base")
 
 
-def gerar_resposta_compilada(serializer: Union[GerarDocumentoSerializerData, Any]):
-    return {
-        "num_chunks_retrieval": serializer.num_chunks_retrieval,
-        "embedding_weight": serializer.embedding_weight,
-        "bm25_weight": serializer.bm25_weight,
-        "context_window": serializer.context_window,
-        "chunk_overlap": serializer.chunk_overlap,
-        "num_k_rerank": serializer.num_k_rerank,
-        "model_cohere_rerank": serializer.model_cohere_rerank,
-        "more_initial_chunks_for_reranking": serializer.more_initial_chunks_for_reranking,
-        "claude_context_model": serializer.claude_context_model,
-        "gpt_temperature": serializer.gpt_temperature,
-        "user_message": serializer.user_message,
-        "model": serializer.model,
-        "hf_embedding": serializer.hf_embedding,
-        "chunk_size": serializer.chunk_size,
-        "chunk_overlap": serializer.chunk_overlap,
-        # "prompt_auxiliar": serializer.prompt_auxiliar,
-        "prompt_gerar_documento": serializer.prompt_gerar_documento[0:200],
-    }
-
-
-# This function generates the response that will be used in each chunk's request
-async def get_response_from_auxiliar_contextual_prompt(full_text_as_array: List[str]):
-    llms = LLM()
-    responses = []
-
-    current_chunk = []
-    current_token_count = 0
-    chunk_counter = 1
-
-    for part in full_text_as_array:
-        part_tokens = len(encoding.encode(part))
-
-        # Check if adding this part would EXCEED the limit
-        if current_token_count + part_tokens > 600000:
-            # Process the accumulated chunk before it exceeds the limit
-            chunk_text = "".join(current_chunk)
-            print(
-                f"\nProcessing chunk {chunk_counter} with {current_token_count} tokens"
-            )
-
-            prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
-            response = await llms.google_gemini().ainvoke(
-                [HumanMessage(content=prompt)]
-            )
-            responses.append(response.content)
-
-            # Start new chunk with current part
-            current_chunk = [part]
-            current_token_count = part_tokens
-            chunk_counter += 1
-        else:
-            # Safe to add to current chunk
-            current_chunk.append(part)
-            current_token_count += part_tokens
-
-    # Process the final remaining chunk
-    if current_chunk:
-        chunk_text = "".join(current_chunk)
-        print(
-            f"\nProcessing final chunk {chunk_counter} with {current_token_count} tokens"
-        )
-        prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
-        response = await llms.google_gemini().ainvoke([HumanMessage(content=prompt)])
-        responses.append(response.content)
-
-    return "".join(responses)
-
-
 def split_text_by_tokens(full_text: str):
     tokens = encoding.encode(full_text)
     max_tokens = 600000
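Only the first two statements of split_text_by_tokens fall inside the changed region, so the rest of its body is not shown in this diff. Purely as a hedged sketch of how such a token-window splitter is typically completed (the body below is an assumption for illustration, not the committed implementation), the token stream can be cut into windows of at most max_tokens tokens and each window decoded back to text:

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

def split_text_by_tokens(full_text: str) -> list[str]:
    # First two statements as they appear in the diff above.
    tokens = encoding.encode(full_text)
    max_tokens = 600000

    # Assumed continuation (not part of the commit): slice the token stream
    # into windows of at most max_tokens tokens and decode each window back
    # into a text segment.
    return [
        encoding.decode(tokens[i : i + max_tokens])
        for i in range(0, len(tokens), max_tokens)
    ]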
gerar_documento/views.py CHANGED
@@ -7,7 +7,6 @@ from _utils.gerar_documento_utils.GerarDocumento import GerarDocumento
 from _utils.langchain_utils.LLM_class import LLM
 from _utils.gerar_documento_utils.utils import (
     generate_document_title,
-    gerar_resposta_compilada,
     split_text_by_tokens,
 )
 from _utils.langchain_utils.Prompt_class import Prompt