Spaces:
Running
Running
luanpoppe
committed on
Commit
·
a1f037d
1
Parent(s):
756fca0
feat: melhorando a instanciação de algumas classes de gerar documentos
Browse files
_utils/gerar_documento.py
CHANGED
@@ -26,26 +26,11 @@ import markdown
|
|
26 |
|
27 |
from _utils.langchain_utils.Prompt_class import Prompt
|
28 |
from _utils.utils import convert_markdown_to_HTML
|
29 |
-
from gerar_documento.serializer import
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
fused_scores = {}
|
35 |
-
num_lists = len(result_lists)
|
36 |
-
if weights is None:
|
37 |
-
weights = [1.0] * num_lists
|
38 |
-
|
39 |
-
for i in range(num_lists):
|
40 |
-
for doc_id, score in result_lists[i]:
|
41 |
-
if doc_id not in fused_scores:
|
42 |
-
fused_scores[doc_id] = 0
|
43 |
-
fused_scores[doc_id] += weights[i] * score
|
44 |
-
|
45 |
-
# Sort by score in descending order
|
46 |
-
sorted_results = sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
|
47 |
-
|
48 |
-
return sorted_results
|
49 |
|
50 |
|
51 |
os.environ["LANGCHAIN_TRACING_V2"] = "true"
|
@@ -55,37 +40,18 @@ os.environ["LANGCHAIN_PROJECT"] = "VELLA"
|
|
55 |
|
56 |
|
57 |
async def gerar_documento(
|
58 |
-
serializer: Union[
|
|
|
|
|
|
|
|
|
59 |
):
|
60 |
"""Parâmetro "contexto" só deve ser passado quando quiser utilizar o teste com ragas, e assim, não quiser passar PDFs"""
|
61 |
try:
|
62 |
-
|
63 |
-
config = RetrievalConfig(
|
64 |
-
num_chunks=serializer.num_chunks_retrieval,
|
65 |
-
embedding_weight=serializer.embedding_weight,
|
66 |
-
bm25_weight=serializer.bm25_weight,
|
67 |
-
context_window=serializer.context_window,
|
68 |
-
chunk_overlap=serializer.chunk_overlap,
|
69 |
-
)
|
70 |
-
|
71 |
-
contextual_retriever = ContextualRetriever(
|
72 |
-
config, serializer.claude_context_model
|
73 |
-
)
|
74 |
|
75 |
# Initialize enhanced summarizer
|
76 |
-
summarizer = GerarDocumento(
|
77 |
-
config=config,
|
78 |
-
embedding_model=serializer.hf_embedding,
|
79 |
-
chunk_overlap=serializer.chunk_overlap,
|
80 |
-
chunk_size=serializer.chunk_size,
|
81 |
-
num_k_rerank=serializer.num_k_rerank,
|
82 |
-
model_cohere_rerank=serializer.model_cohere_rerank,
|
83 |
-
# prompt_auxiliar=serializer.prompt_auxiliar,
|
84 |
-
gpt_model=serializer.model,
|
85 |
-
gpt_temperature=serializer.gpt_temperature,
|
86 |
-
prompt_gerar_documento=serializer.prompt_gerar_documento,
|
87 |
-
reciprocal_rank_fusion=reciprocal_rank_fusion,
|
88 |
-
)
|
89 |
|
90 |
all_PDFs_chunks, full_text_as_array, full_text_as_string = (
|
91 |
await get_full_text_and_all_PDFs_chunks(
|
@@ -173,9 +139,9 @@ async def gerar_documento(
|
|
173 |
if isBubble:
|
174 |
print("COMEÇANDO A REQUISIÇÃO FINAL PARA O BUBBLE")
|
175 |
enviar_resposta_final(
|
176 |
-
serializer.doc_id,
|
177 |
-
serializer.form_response_id,
|
178 |
-
serializer.version,
|
179 |
texto_completo_como_html,
|
180 |
False,
|
181 |
cast(str, titulo_do_documento),
|
|
|
26 |
|
27 |
from _utils.langchain_utils.Prompt_class import Prompt
|
28 |
from _utils.utils import convert_markdown_to_HTML
|
29 |
+
from gerar_documento.serializer import (
|
30 |
+
GerarDocumentoComPDFProprioSerializer,
|
31 |
+
GerarDocumentoComPDFProprioSerializerData,
|
32 |
+
GerarDocumentoSerializerData,
|
33 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
|
36 |
os.environ["LANGCHAIN_TRACING_V2"] = "true"
|
|
|
40 |
|
41 |
|
42 |
async def gerar_documento(
|
43 |
+
serializer: Union[
|
44 |
+
GerarDocumentoSerializerData, GerarDocumentoComPDFProprioSerializerData, Any
|
45 |
+
],
|
46 |
+
listaPDFs,
|
47 |
+
isBubble=False,
|
48 |
):
|
49 |
"""Parâmetro "contexto" só deve ser passado quando quiser utilizar o teste com ragas, e assim, não quiser passar PDFs"""
|
50 |
try:
|
51 |
+
contextual_retriever = ContextualRetriever(serializer)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
# Initialize enhanced summarizer
|
54 |
+
summarizer = GerarDocumento(serializer)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
all_PDFs_chunks, full_text_as_array, full_text_as_string = (
|
57 |
await get_full_text_and_all_PDFs_chunks(
|
|
|
139 |
if isBubble:
|
140 |
print("COMEÇANDO A REQUISIÇÃO FINAL PARA O BUBBLE")
|
141 |
enviar_resposta_final(
|
142 |
+
serializer.doc_id, # type: ignore
|
143 |
+
serializer.form_response_id, # type: ignore
|
144 |
+
serializer.version, # type: ignore
|
145 |
texto_completo_como_html,
|
146 |
False,
|
147 |
cast(str, titulo_do_documento),
|
_utils/gerar_relatorio_modelo_usuario/GerarDocumento.py
CHANGED
@@ -1,9 +1,13 @@
|
|
1 |
import os
|
2 |
-
from typing import List, Dict, Tuple, Optional, cast
|
3 |
|
4 |
from pydantic import SecretStr
|
5 |
from _utils.langchain_utils.LLM_class import LLM
|
6 |
from _utils.langchain_utils.Vector_store_class import VectorStore
|
|
|
|
|
|
|
|
|
7 |
from setup.easy_imports import (
|
8 |
Chroma,
|
9 |
ChatOpenAI,
|
@@ -23,6 +27,25 @@ from cohere import Client
|
|
23 |
from _utils.langchain_utils.Splitter_class import Splitter
|
24 |
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
class GerarDocumento:
|
27 |
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
|
28 |
cohere_api_key = os.environ.get("COHERE_API_KEY", "")
|
@@ -30,35 +53,31 @@ class GerarDocumento:
|
|
30 |
|
31 |
def __init__(
|
32 |
self,
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
chunk_overlap,
|
37 |
-
num_k_rerank,
|
38 |
-
model_cohere_rerank,
|
39 |
-
# prompt_auxiliar,
|
40 |
-
gpt_model,
|
41 |
-
gpt_temperature,
|
42 |
-
# id_modelo_do_usuario,
|
43 |
-
prompt_gerar_documento,
|
44 |
-
reciprocal_rank_fusion,
|
45 |
):
|
46 |
-
self.config =
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
self.logger = logging.getLogger(__name__)
|
48 |
# self.prompt_auxiliar = prompt_auxiliar
|
49 |
-
self.gpt_model =
|
50 |
-
self.gpt_temperature = gpt_temperature
|
51 |
-
self.prompt_gerar_documento = prompt_gerar_documento
|
52 |
-
self.reciprocal_rank_fusion = reciprocal_rank_fusion
|
53 |
|
54 |
self.openai_api_key = self.openai_api_key
|
55 |
self.cohere_client = Client(self.cohere_api_key)
|
56 |
-
self.embeddings = HuggingFaceEmbeddings(model_name=
|
57 |
-
self.num_k_rerank = num_k_rerank
|
58 |
-
self.model_cohere_rerank = model_cohere_rerank
|
59 |
-
self.splitter = Splitter(chunk_size, chunk_overlap)
|
60 |
|
61 |
-
self.vector_store = VectorStore(
|
62 |
|
63 |
def retrieve_with_rank_fusion(
|
64 |
self, vector_store: Chroma, bm25: BM25Okapi, chunk_ids: List[str], query: str
|
@@ -101,11 +120,9 @@ class GerarDocumento:
|
|
101 |
result_lists = [embedding_list, bm25_list]
|
102 |
weights = [self.config.embedding_weight, self.config.bm25_weight]
|
103 |
|
104 |
-
combined_results =
|
105 |
-
result_lists, weights=weights
|
106 |
-
)
|
107 |
|
108 |
-
return combined_results
|
109 |
|
110 |
except Exception as e:
|
111 |
self.logger.error(f"Error in rank fusion retrieval: {str(e)}")
|
@@ -189,7 +206,7 @@ class GerarDocumento:
|
|
189 |
# self.resumo_gerado = cast(str, resumo_auxiliar_do_documento.content)
|
190 |
|
191 |
prompt_gerar_documento = PromptTemplate(
|
192 |
-
template=self.prompt_gerar_documento,
|
193 |
input_variables=["context"],
|
194 |
)
|
195 |
|
|
|
1 |
import os
|
2 |
+
from typing import Any, List, Dict, Tuple, Optional, Union, cast
|
3 |
|
4 |
from pydantic import SecretStr
|
5 |
from _utils.langchain_utils.LLM_class import LLM
|
6 |
from _utils.langchain_utils.Vector_store_class import VectorStore
|
7 |
+
from gerar_documento.serializer import (
|
8 |
+
GerarDocumentoComPDFProprioSerializerData,
|
9 |
+
GerarDocumentoSerializerData,
|
10 |
+
)
|
11 |
from setup.easy_imports import (
|
12 |
Chroma,
|
13 |
ChatOpenAI,
|
|
|
27 |
from _utils.langchain_utils.Splitter_class import Splitter
|
28 |
|
29 |
|
30 |
+
def reciprocal_rank_fusion(result_lists, weights=None):
    """Fuse several scored result lists into one ranking by weighted score sum.

    Despite the name, no reciprocal-rank term (``1 / (k + rank)``) is
    computed here: a document's fused score is the sum, over every list it
    appears in, of ``weight * score``. The scores passed in are used as-is.

    Args:
        result_lists: Iterable of result lists; each list yields
            ``(doc_id, score)`` pairs.
        weights: Optional per-list multipliers, indexed in the same order as
            ``result_lists``. Defaults to ``1.0`` for every list.

    Returns:
        A list of ``(doc_id, fused_score)`` tuples sorted by fused score in
        descending order. Documents with equal scores keep first-seen order.

    Raises:
        IndexError: If a non-empty result list has no matching entry in a
            ``weights`` list.
    """
    # Materialise once so any iterable (not only a sized sequence) works.
    result_lists = list(result_lists)
    if weights is None:
        weights = [1.0] * len(result_lists)

    fused_scores = {}
    for list_index, results in enumerate(result_lists):
        for doc_id, score in results:
            # The weight is looked up per pair, so an empty list never
            # touches ``weights`` (same as the index-based original).
            weighted = weights[list_index] * score
            fused_scores[doc_id] = fused_scores.get(doc_id, 0) + weighted

    # sorted() is stable, also with reverse=True, so ties stay in
    # insertion order.
    return sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
|
47 |
+
|
48 |
+
|
49 |
class GerarDocumento:
|
50 |
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
|
51 |
cohere_api_key = os.environ.get("COHERE_API_KEY", "")
|
|
|
53 |
|
54 |
def __init__(
|
55 |
self,
|
56 |
+
serializer: Union[
|
57 |
+
GerarDocumentoSerializerData, GerarDocumentoComPDFProprioSerializerData, Any
|
58 |
+
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
):
|
60 |
+
self.config = RetrievalConfig(
|
61 |
+
num_chunks=serializer.num_chunks_retrieval,
|
62 |
+
embedding_weight=serializer.embedding_weight,
|
63 |
+
bm25_weight=serializer.bm25_weight,
|
64 |
+
context_window=serializer.context_window,
|
65 |
+
chunk_overlap=serializer.chunk_overlap,
|
66 |
+
)
|
67 |
self.logger = logging.getLogger(__name__)
|
68 |
# self.prompt_auxiliar = prompt_auxiliar
|
69 |
+
self.gpt_model = serializer.model
|
70 |
+
self.gpt_temperature = serializer.gpt_temperature
|
71 |
+
self.prompt_gerar_documento = serializer.prompt_gerar_documento
|
|
|
72 |
|
73 |
self.openai_api_key = self.openai_api_key
|
74 |
self.cohere_client = Client(self.cohere_api_key)
|
75 |
+
self.embeddings = HuggingFaceEmbeddings(model_name=serializer.hf_embedding)
|
76 |
+
self.num_k_rerank = serializer.num_k_rerank
|
77 |
+
self.model_cohere_rerank = serializer.model_cohere_rerank
|
78 |
+
self.splitter = Splitter(serializer.chunk_size, serializer.chunk_overlap)
|
79 |
|
80 |
+
self.vector_store = VectorStore(serializer.hf_embedding)
|
81 |
|
82 |
def retrieve_with_rank_fusion(
|
83 |
self, vector_store: Chroma, bm25: BM25Okapi, chunk_ids: List[str], query: str
|
|
|
120 |
result_lists = [embedding_list, bm25_list]
|
121 |
weights = [self.config.embedding_weight, self.config.bm25_weight]
|
122 |
|
123 |
+
combined_results = reciprocal_rank_fusion(result_lists, weights=weights)
|
|
|
|
|
124 |
|
125 |
+
return combined_results # type: ignore
|
126 |
|
127 |
except Exception as e:
|
128 |
self.logger.error(f"Error in rank fusion retrieval: {str(e)}")
|
|
|
206 |
# self.resumo_gerado = cast(str, resumo_auxiliar_do_documento.content)
|
207 |
|
208 |
prompt_gerar_documento = PromptTemplate(
|
209 |
+
template=cast(str, self.prompt_gerar_documento),
|
210 |
input_variables=["context"],
|
211 |
)
|
212 |
|
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py
CHANGED
@@ -4,7 +4,7 @@ from _utils.gerar_relatorio_modelo_usuario.utils import (
|
|
4 |
get_response_from_auxiliar_contextual_prompt,
|
5 |
validate_many_chunks_in_one_request,
|
6 |
)
|
7 |
-
from typing import Any, List, Dict, Tuple, Optional, cast
|
8 |
from anthropic import Anthropic, AsyncAnthropic
|
9 |
import logging
|
10 |
from langchain.schema import Document
|
@@ -13,7 +13,11 @@ import asyncio
|
|
13 |
from typing import List
|
14 |
from dataclasses import dataclass
|
15 |
|
16 |
-
from _utils.gerar_relatorio_modelo_usuario.llm_calls import
|
|
|
|
|
|
|
|
|
17 |
from _utils.gerar_relatorio_modelo_usuario.prompts import contextual_prompt
|
18 |
from _utils.models.gerar_relatorio import (
|
19 |
ContextualizedChunk,
|
@@ -22,16 +26,32 @@ from _utils.models.gerar_relatorio import (
|
|
22 |
)
|
23 |
from langchain_core.messages import HumanMessage
|
24 |
|
|
|
|
|
|
|
|
|
|
|
25 |
lista_contador = []
|
26 |
|
27 |
|
28 |
class ContextualRetriever:
|
29 |
|
30 |
-
def __init__(
|
31 |
-
self
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
self.logger = logging.getLogger(__name__)
|
33 |
self.bm25 = None
|
34 |
-
self.claude_context_model = claude_context_model
|
35 |
|
36 |
self.claude_api_key = os.environ.get("CLAUDE_API_KEY", "")
|
37 |
self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
|
@@ -81,15 +101,15 @@ class ContextualRetriever:
|
|
81 |
|
82 |
for attempt in range(4):
|
83 |
if attempt != 0:
|
84 |
-
print(
|
85 |
-
|
86 |
-
|
87 |
-
)
|
88 |
print("COMEÇANDO UMA REQUISIÇÃO DO CONTEXTUAL")
|
89 |
# raw_response = await agpt_answer(prompt)
|
90 |
# raw_response = await agemini_answer(prompt, "gemini-2.0-flash-lite-preview-02-05")
|
91 |
raw_response = await agemini_answer(prompt, "gemini-2.0-flash-lite")
|
92 |
-
|
93 |
print("TERMINOU UMA REQUISIÇÃO DO CONTEXTUAL")
|
94 |
response = cast(str, raw_response)
|
95 |
# response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
|
@@ -131,10 +151,10 @@ class ContextualRetriever:
|
|
131 |
context=result[index][1],
|
132 |
)
|
133 |
)
|
134 |
-
except BaseException as e
|
135 |
print(e)
|
136 |
print("\nERRO DO CONTEXTUAL")
|
137 |
-
print(
|
138 |
|
139 |
return lista_chunks
|
140 |
|
|
|
4 |
get_response_from_auxiliar_contextual_prompt,
|
5 |
validate_many_chunks_in_one_request,
|
6 |
)
|
7 |
+
from typing import Any, List, Dict, Tuple, Optional, Union, cast
|
8 |
from anthropic import Anthropic, AsyncAnthropic
|
9 |
import logging
|
10 |
from langchain.schema import Document
|
|
|
13 |
from typing import List
|
14 |
from dataclasses import dataclass
|
15 |
|
16 |
+
from _utils.gerar_relatorio_modelo_usuario.llm_calls import (
|
17 |
+
aclaude_answer,
|
18 |
+
agemini_answer,
|
19 |
+
agpt_answer,
|
20 |
+
)
|
21 |
from _utils.gerar_relatorio_modelo_usuario.prompts import contextual_prompt
|
22 |
from _utils.models.gerar_relatorio import (
|
23 |
ContextualizedChunk,
|
|
|
26 |
)
|
27 |
from langchain_core.messages import HumanMessage
|
28 |
|
29 |
+
from gerar_documento.serializer import (
|
30 |
+
GerarDocumentoComPDFProprioSerializerData,
|
31 |
+
GerarDocumentoSerializerData,
|
32 |
+
)
|
33 |
+
|
34 |
lista_contador = []
|
35 |
|
36 |
|
37 |
class ContextualRetriever:
|
38 |
|
39 |
+
def __init__(
|
40 |
+
self,
|
41 |
+
serializer: Union[
|
42 |
+
GerarDocumentoSerializerData, GerarDocumentoComPDFProprioSerializerData, Any
|
43 |
+
],
|
44 |
+
):
|
45 |
+
self.config = RetrievalConfig(
|
46 |
+
num_chunks=serializer.num_chunks_retrieval,
|
47 |
+
embedding_weight=serializer.embedding_weight,
|
48 |
+
bm25_weight=serializer.bm25_weight,
|
49 |
+
context_window=serializer.context_window,
|
50 |
+
chunk_overlap=serializer.chunk_overlap,
|
51 |
+
)
|
52 |
self.logger = logging.getLogger(__name__)
|
53 |
self.bm25 = None
|
54 |
+
self.claude_context_model = serializer.claude_context_model
|
55 |
|
56 |
self.claude_api_key = os.environ.get("CLAUDE_API_KEY", "")
|
57 |
self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
|
|
|
101 |
|
102 |
for attempt in range(4):
|
103 |
if attempt != 0:
|
104 |
+
print(
|
105 |
+
"------------- FORMATAÇÃO DO CONTEXTUAL INCORRETA - TENTANDO NOVAMENTE -------------"
|
106 |
+
)
|
107 |
+
print(f"TENTATIVA FORMATAÇÃO CHUNKS NÚMERO {attempt + 1}")
|
108 |
print("COMEÇANDO UMA REQUISIÇÃO DO CONTEXTUAL")
|
109 |
# raw_response = await agpt_answer(prompt)
|
110 |
# raw_response = await agemini_answer(prompt, "gemini-2.0-flash-lite-preview-02-05")
|
111 |
raw_response = await agemini_answer(prompt, "gemini-2.0-flash-lite")
|
112 |
+
|
113 |
print("TERMINOU UMA REQUISIÇÃO DO CONTEXTUAL")
|
114 |
response = cast(str, raw_response)
|
115 |
# response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
|
|
|
151 |
context=result[index][1],
|
152 |
)
|
153 |
)
|
154 |
+
except BaseException as e:
|
155 |
print(e)
|
156 |
print("\nERRO DO CONTEXTUAL")
|
157 |
+
print("\n\nresult", result)
|
158 |
|
159 |
return lista_chunks
|
160 |
|
gerar_documento/serializer.py
CHANGED
@@ -157,7 +157,7 @@ class GerarDocumentoComPDFProprioSerializer(GerarDocumentoInitialSerializer):
|
|
157 |
|
158 |
|
159 |
@dataclass
|
160 |
-
class
|
161 |
prompt_gerar_documento: Optional[str] = field(default=None)
|
162 |
user_message: Optional[str] = field(default=None)
|
163 |
num_chunks_retrieval: int = field(default=20)
|
|
|
157 |
|
158 |
|
159 |
@dataclass
|
160 |
+
class GerarDocumentoComPDFProprioSerializerData(GerarDocumentoInitialSerializerData):
|
161 |
prompt_gerar_documento: Optional[str] = field(default=None)
|
162 |
user_message: Optional[str] = field(default=None)
|
163 |
num_chunks_retrieval: int = field(default=20)
|