luanpoppe committed
Commit a1f037d
Parent: 756fca0

feat: improving the instantiation of some document-generation classes
_utils/gerar_documento.py CHANGED
@@ -26,26 +26,11 @@ import markdown
 
 from _utils.langchain_utils.Prompt_class import Prompt
 from _utils.utils import convert_markdown_to_HTML
-from gerar_documento.serializer import GerarDocumentoSerializerData
-
-
-def reciprocal_rank_fusion(result_lists, weights=None):
-    """Combine multiple ranked lists using reciprocal rank fusion"""
-    fused_scores = {}
-    num_lists = len(result_lists)
-    if weights is None:
-        weights = [1.0] * num_lists
-
-    for i in range(num_lists):
-        for doc_id, score in result_lists[i]:
-            if doc_id not in fused_scores:
-                fused_scores[doc_id] = 0
-            fused_scores[doc_id] += weights[i] * score
-
-    # Sort by score in descending order
-    sorted_results = sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
-
-    return sorted_results
+from gerar_documento.serializer import (
+    GerarDocumentoComPDFProprioSerializer,
+    GerarDocumentoComPDFProprioSerializerData,
+    GerarDocumentoSerializerData,
+)
 
 
 os.environ["LANGCHAIN_TRACING_V2"] = "true"
@@ -55,37 +40,18 @@ os.environ["LANGCHAIN_PROJECT"] = "VELLA"
 
 
 async def gerar_documento(
-    serializer: Union[GerarDocumentoSerializerData, Any], listaPDFs, isBubble=False
+    serializer: Union[
+        GerarDocumentoSerializerData, GerarDocumentoComPDFProprioSerializerData, Any
+    ],
+    listaPDFs,
+    isBubble=False,
 ):
     """Parâmetro "contexto" só deve ser passado quando quiser utilizar o teste com ragas, e assim, não quiser passar PDFs"""
     try:
-        # Configuration
-        config = RetrievalConfig(
-            num_chunks=serializer.num_chunks_retrieval,
-            embedding_weight=serializer.embedding_weight,
-            bm25_weight=serializer.bm25_weight,
-            context_window=serializer.context_window,
-            chunk_overlap=serializer.chunk_overlap,
-        )
-
-        contextual_retriever = ContextualRetriever(
-            config, serializer.claude_context_model
-        )
+        contextual_retriever = ContextualRetriever(serializer)
 
         # Initialize enhanced summarizer
-        summarizer = GerarDocumento(
-            config=config,
-            embedding_model=serializer.hf_embedding,
-            chunk_overlap=serializer.chunk_overlap,
-            chunk_size=serializer.chunk_size,
-            num_k_rerank=serializer.num_k_rerank,
-            model_cohere_rerank=serializer.model_cohere_rerank,
-            # prompt_auxiliar=serializer.prompt_auxiliar,
-            gpt_model=serializer.model,
-            gpt_temperature=serializer.gpt_temperature,
-            prompt_gerar_documento=serializer.prompt_gerar_documento,
-            reciprocal_rank_fusion=reciprocal_rank_fusion,
-        )
+        summarizer = GerarDocumento(serializer)
 
         all_PDFs_chunks, full_text_as_array, full_text_as_string = (
             await get_full_text_and_all_PDFs_chunks(
@@ -173,9 +139,9 @@ async def gerar_documento(
         if isBubble:
            print("COMEÇANDO A REQUISIÇÃO FINAL PARA O BUBBLE")
            enviar_resposta_final(
-                serializer.doc_id,
-                serializer.form_response_id,
-                serializer.version,
+                serializer.doc_id,  # type: ignore
+                serializer.form_response_id,  # type: ignore
+                serializer.version,  # type: ignore
                texto_completo_como_html,
                False,
                cast(str, titulo_do_documento),
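
Note on this file's change: call sites no longer build a RetrievalConfig or thread a dozen keyword arguments through; each class now pulls what it needs off the serializer, and the trailing Any in the Union annotation means any attribute-compatible object is accepted. A minimal call-site sketch follows; the FakeSerializerData stand-in and every default value in it are hypothetical, only the attribute names are taken from this commit:

from dataclasses import dataclass

from _utils.gerar_relatorio_modelo_usuario.GerarDocumento import GerarDocumento
from _utils.gerar_relatorio_modelo_usuario.contextual_retriever import (
    ContextualRetriever,
)


@dataclass
class FakeSerializerData:
    # Attribute names mirror what the refactored constructors read;
    # all default values below are invented for illustration.
    num_chunks_retrieval: int = 20
    embedding_weight: float = 0.5
    bm25_weight: float = 0.5
    context_window: int = 3
    chunk_overlap: int = 200
    chunk_size: int = 1000
    num_k_rerank: int = 5
    model_cohere_rerank: str = "rerank-multilingual-v3.0"
    claude_context_model: str = "claude-3-5-haiku-20241022"
    hf_embedding: str = "sentence-transformers/all-MiniLM-L6-v2"
    model: str = "gpt-4o-mini"
    gpt_temperature: float = 0.0
    prompt_gerar_documento: str = "Gere o documento a partir de: {context}"


serializer = FakeSerializerData()
contextual_retriever = ContextualRetriever(serializer)  # was: (config, claude_context_model)
summarizer = GerarDocumento(serializer)  # was: eleven keyword arguments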
_utils/gerar_relatorio_modelo_usuario/GerarDocumento.py CHANGED
@@ -1,9 +1,13 @@
 import os
-from typing import List, Dict, Tuple, Optional, cast
+from typing import Any, List, Dict, Tuple, Optional, Union, cast
 
 from pydantic import SecretStr
 from _utils.langchain_utils.LLM_class import LLM
 from _utils.langchain_utils.Vector_store_class import VectorStore
+from gerar_documento.serializer import (
+    GerarDocumentoComPDFProprioSerializerData,
+    GerarDocumentoSerializerData,
+)
 from setup.easy_imports import (
     Chroma,
     ChatOpenAI,
@@ -23,6 +27,25 @@ from cohere import Client
 from _utils.langchain_utils.Splitter_class import Splitter
 
 
+def reciprocal_rank_fusion(result_lists, weights=None):
+    """Combine multiple ranked lists using reciprocal rank fusion"""
+    fused_scores = {}
+    num_lists = len(result_lists)
+    if weights is None:
+        weights = [1.0] * num_lists
+
+    for i in range(num_lists):
+        for doc_id, score in result_lists[i]:
+            if doc_id not in fused_scores:
+                fused_scores[doc_id] = 0
+            fused_scores[doc_id] += weights[i] * score
+
+    # Sort by score in descending order
+    sorted_results = sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
+
+    return sorted_results
+
+
 class GerarDocumento:
     openai_api_key = os.environ.get("OPENAI_API_KEY", "")
     cohere_api_key = os.environ.get("COHERE_API_KEY", "")
@@ -30,35 +53,31 @@ class GerarDocumento:
 
     def __init__(
         self,
-        config: RetrievalConfig,
-        embedding_model,
-        chunk_size,
-        chunk_overlap,
-        num_k_rerank,
-        model_cohere_rerank,
-        # prompt_auxiliar,
-        gpt_model,
-        gpt_temperature,
-        # id_modelo_do_usuario,
-        prompt_gerar_documento,
-        reciprocal_rank_fusion,
+        serializer: Union[
+            GerarDocumentoSerializerData, GerarDocumentoComPDFProprioSerializerData, Any
+        ],
     ):
-        self.config = config
+        self.config = RetrievalConfig(
+            num_chunks=serializer.num_chunks_retrieval,
+            embedding_weight=serializer.embedding_weight,
+            bm25_weight=serializer.bm25_weight,
+            context_window=serializer.context_window,
+            chunk_overlap=serializer.chunk_overlap,
+        )
         self.logger = logging.getLogger(__name__)
         # self.prompt_auxiliar = prompt_auxiliar
-        self.gpt_model = gpt_model
-        self.gpt_temperature = gpt_temperature
-        self.prompt_gerar_documento = prompt_gerar_documento
-        self.reciprocal_rank_fusion = reciprocal_rank_fusion
+        self.gpt_model = serializer.model
+        self.gpt_temperature = serializer.gpt_temperature
+        self.prompt_gerar_documento = serializer.prompt_gerar_documento
 
         self.openai_api_key = self.openai_api_key
         self.cohere_client = Client(self.cohere_api_key)
-        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
-        self.num_k_rerank = num_k_rerank
-        self.model_cohere_rerank = model_cohere_rerank
-        self.splitter = Splitter(chunk_size, chunk_overlap)
+        self.embeddings = HuggingFaceEmbeddings(model_name=serializer.hf_embedding)
+        self.num_k_rerank = serializer.num_k_rerank
+        self.model_cohere_rerank = serializer.model_cohere_rerank
+        self.splitter = Splitter(serializer.chunk_size, serializer.chunk_overlap)
 
-        self.vector_store = VectorStore(embedding_model)
+        self.vector_store = VectorStore(serializer.hf_embedding)
 
     def retrieve_with_rank_fusion(
         self, vector_store: Chroma, bm25: BM25Okapi, chunk_ids: List[str], query: str
@@ -101,11 +120,9 @@ class GerarDocumento:
             result_lists = [embedding_list, bm25_list]
             weights = [self.config.embedding_weight, self.config.bm25_weight]
 
-            combined_results = self.reciprocal_rank_fusion(
-                result_lists, weights=weights
-            )
+            combined_results = reciprocal_rank_fusion(result_lists, weights=weights)
 
-            return combined_results
+            return combined_results  # type: ignore
 
         except Exception as e:
             self.logger.error(f"Error in rank fusion retrieval: {str(e)}")
@@ -189,7 +206,7 @@ class GerarDocumento:
         # self.resumo_gerado = cast(str, resumo_auxiliar_do_documento.content)
 
         prompt_gerar_documento = PromptTemplate(
-            template=self.prompt_gerar_documento,
+            template=cast(str, self.prompt_gerar_documento),
             input_variables=["context"],
         )
 
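Note on this file's change: with reciprocal_rank_fusion promoted to a module-level function (it was previously injected through GerarDocumento's constructor), it can be exercised on its own. A worked example with made-up scores; observe that, despite its name, the function sums weighted raw scores per document rather than computing classic 1/(k + rank) reciprocal-rank fusion:

from _utils.gerar_relatorio_modelo_usuario.GerarDocumento import reciprocal_rank_fusion

# Two ranked lists of (doc_id, score) pairs, shaped like the embedding and
# BM25 results that retrieve_with_rank_fusion feeds in (values invented).
embedding_list = [("doc1", 0.9), ("doc2", 0.7), ("doc3", 0.4)]
bm25_list = [("doc2", 0.8), ("doc1", 0.3)]

# The weights mirror config.embedding_weight and config.bm25_weight.
fused = reciprocal_rank_fusion([embedding_list, bm25_list], weights=[0.5, 0.5])
print(fused)
# [('doc2', 0.75), ('doc1', 0.6), ('doc3', 0.2)] -- doc2 wins because
# 0.5 * 0.7 + 0.5 * 0.8 = 0.75 (values subject to float rounding)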
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py CHANGED
@@ -4,7 +4,7 @@ from _utils.gerar_relatorio_modelo_usuario.utils import (
     get_response_from_auxiliar_contextual_prompt,
     validate_many_chunks_in_one_request,
 )
-from typing import Any, List, Dict, Tuple, Optional, cast
+from typing import Any, List, Dict, Tuple, Optional, Union, cast
 from anthropic import Anthropic, AsyncAnthropic
 import logging
 from langchain.schema import Document
@@ -13,7 +13,11 @@ import asyncio
 from typing import List
 from dataclasses import dataclass
 
-from _utils.gerar_relatorio_modelo_usuario.llm_calls import aclaude_answer, agemini_answer, agpt_answer
+from _utils.gerar_relatorio_modelo_usuario.llm_calls import (
+    aclaude_answer,
+    agemini_answer,
+    agpt_answer,
+)
 from _utils.gerar_relatorio_modelo_usuario.prompts import contextual_prompt
 from _utils.models.gerar_relatorio import (
     ContextualizedChunk,
@@ -22,16 +26,32 @@ from _utils.models.gerar_relatorio import (
 )
 from langchain_core.messages import HumanMessage
 
+from gerar_documento.serializer import (
+    GerarDocumentoComPDFProprioSerializerData,
+    GerarDocumentoSerializerData,
+)
+
 lista_contador = []
 
 
 class ContextualRetriever:
 
-    def __init__(self, config: RetrievalConfig, claude_context_model: str):
-        self.config = config
+    def __init__(
+        self,
+        serializer: Union[
+            GerarDocumentoSerializerData, GerarDocumentoComPDFProprioSerializerData, Any
+        ],
+    ):
+        self.config = RetrievalConfig(
+            num_chunks=serializer.num_chunks_retrieval,
+            embedding_weight=serializer.embedding_weight,
+            bm25_weight=serializer.bm25_weight,
+            context_window=serializer.context_window,
+            chunk_overlap=serializer.chunk_overlap,
+        )
         self.logger = logging.getLogger(__name__)
         self.bm25 = None
-        self.claude_context_model = claude_context_model
+        self.claude_context_model = serializer.claude_context_model
 
         self.claude_api_key = os.environ.get("CLAUDE_API_KEY", "")
         self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
@@ -81,15 +101,15 @@
 
         for attempt in range(4):
             if attempt != 0:
-                print("------------- FORMATAÇÃO DO CONTEXTUAL INCORRETA - TENTANDO NOVAMENTE -------------")
-                print(
-                    f"TENTATIVA FORMATAÇÃO CHUNKS NÚMERO {attempt + 1}"
-                )
+                print(
+                    "------------- FORMATAÇÃO DO CONTEXTUAL INCORRETA - TENTANDO NOVAMENTE -------------"
+                )
+                print(f"TENTATIVA FORMATAÇÃO CHUNKS NÚMERO {attempt + 1}")
             print("COMEÇANDO UMA REQUISIÇÃO DO CONTEXTUAL")
             # raw_response = await agpt_answer(prompt)
             # raw_response = await agemini_answer(prompt, "gemini-2.0-flash-lite-preview-02-05")
             raw_response = await agemini_answer(prompt, "gemini-2.0-flash-lite")
-
+
             print("TERMINOU UMA REQUISIÇÃO DO CONTEXTUAL")
             response = cast(str, raw_response)
             # response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
@@ -131,10 +151,10 @@
                         context=result[index][1],
                     )
                 )
-        except BaseException as e :
+        except BaseException as e:
             print(e)
             print("\nERRO DO CONTEXTUAL")
-            print('\n\nresult', result)
+            print("\n\nresult", result)
 
         return lista_chunks
 
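Note on this file's change: after this commit, ContextualRetriever.__init__ and GerarDocumento.__init__ build the identical RetrievalConfig from the serializer. If that duplication ever becomes a maintenance burden, the mapping could be factored into one helper; a hypothetical sketch (the helper name and location are assumptions, the field mapping is verbatim from both constructors):

from typing import Any

# RetrievalConfig would be imported from wherever these modules already get it.


def retrieval_config_from_serializer(serializer: Any) -> RetrievalConfig:
    """Build the RetrievalConfig that both classes currently construct inline."""
    return RetrievalConfig(
        num_chunks=serializer.num_chunks_retrieval,
        embedding_weight=serializer.embedding_weight,
        bm25_weight=serializer.bm25_weight,
        context_window=serializer.context_window,
        chunk_overlap=serializer.chunk_overlap,
    )


# Each __init__ body would then reduce to:
# self.config = retrieval_config_from_serializer(serializer)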
gerar_documento/serializer.py CHANGED
@@ -157,7 +157,7 @@ class GerarDocumentoComPDFProprioSerializer(GerarDocumentoInitialSerializer):
 
 
 @dataclass
-class GerarDocumentoComPDFProprioData(GerarDocumentoInitialSerializerData):
+class GerarDocumentoComPDFProprioSerializerData(GerarDocumentoInitialSerializerData):
     prompt_gerar_documento: Optional[str] = field(default=None)
     user_message: Optional[str] = field(default=None)
     num_chunks_retrieval: int = field(default=20)
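
Note on this file's change: the rename pairs the dataclass with the GerarDocumentoComPDFProprioSerializer class shown in the hunk header, matching the existing GerarDocumentoSerializerData naming, which is what lets the Union annotations added elsewhere in this commit reference it. A quick sketch of the convention (assuming the fields inherited from GerarDocumentoInitialSerializerData also carry defaults):

from gerar_documento.serializer import (
    GerarDocumentoComPDFProprioSerializer,  # serializer class: validates the request
    GerarDocumentoComPDFProprioSerializerData,  # dataclass: typed view of the validated data
)

data = GerarDocumentoComPDFProprioSerializerData()
print(data.num_chunks_retrieval)  # 20, per field(default=20) above
print(data.prompt_gerar_documento)  # None until a prompt is supplied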