Spaces:

luanpoppe
/

vella-backend

Running

App Files Files Community

luanpoppe committed on Apr 3

Commit

a1f037d

1 Parent(s): 756fca0

feat: melhorando a instanciação de algumas classes de gerar documentos

Browse files

Files changed (4) hide show

_utils/gerar_documento.py +15 -49
_utils/gerar_relatorio_modelo_usuario/GerarDocumento.py +45 -28
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py +32 -12
gerar_documento/serializer.py +1 -1

_utils/gerar_documento.py CHANGED Viewed

@@ -26,26 +26,11 @@ import markdown
 from _utils.langchain_utils.Prompt_class import Prompt
 from _utils.utils import convert_markdown_to_HTML
-from gerar_documento.serializer import GerarDocumentoSerializerData
-def reciprocal_rank_fusion(result_lists, weights=None):
-    """Combine multiple ranked lists using reciprocal rank fusion"""
-    fused_scores = {}
-    num_lists = len(result_lists)
-    if weights is None:
-        weights = [1.0] * num_lists
-    for i in range(num_lists):
-        for doc_id, score in result_lists[i]:
-            if doc_id not in fused_scores:
-                fused_scores[doc_id] = 0
-            fused_scores[doc_id] += weights[i] * score
-    # Sort by score in descending order
-    sorted_results = sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
-    return sorted_results
 os.environ["LANGCHAIN_TRACING_V2"] = "true"
@@ -55,37 +40,18 @@ os.environ["LANGCHAIN_PROJECT"] = "VELLA"
 async def gerar_documento(
-    serializer: Union[GerarDocumentoSerializerData, Any], listaPDFs, isBubble=False
 ):
     """Parâmetro "contexto" só deve ser passado quando quiser utilizar o teste com ragas, e assim, não quiser passar PDFs"""
     try:
-        # Configuration
-        config = RetrievalConfig(
-            num_chunks=serializer.num_chunks_retrieval,
-            embedding_weight=serializer.embedding_weight,
-            bm25_weight=serializer.bm25_weight,
-            context_window=serializer.context_window,
-            chunk_overlap=serializer.chunk_overlap,
-        )
-        contextual_retriever = ContextualRetriever(
-            config, serializer.claude_context_model
-        )
         # Initialize enhanced summarizer
-        summarizer = GerarDocumento(
-            config=config,
-            embedding_model=serializer.hf_embedding,
-            chunk_overlap=serializer.chunk_overlap,
-            chunk_size=serializer.chunk_size,
-            num_k_rerank=serializer.num_k_rerank,
-            model_cohere_rerank=serializer.model_cohere_rerank,
-            # prompt_auxiliar=serializer.prompt_auxiliar,
-            gpt_model=serializer.model,
-            gpt_temperature=serializer.gpt_temperature,
-            prompt_gerar_documento=serializer.prompt_gerar_documento,
-            reciprocal_rank_fusion=reciprocal_rank_fusion,
-        )
         all_PDFs_chunks, full_text_as_array, full_text_as_string = (
             await get_full_text_and_all_PDFs_chunks(
@@ -173,9 +139,9 @@ async def gerar_documento(
         if isBubble:
             print("COMEÇANDO A REQUISIÇÃO FINAL PARA O BUBBLE")
             enviar_resposta_final(
-                serializer.doc_id,
-                serializer.form_response_id,
-                serializer.version,
                 texto_completo_como_html,
                 False,
                 cast(str, titulo_do_documento),

 from _utils.langchain_utils.Prompt_class import Prompt
 from _utils.utils import convert_markdown_to_HTML
+from gerar_documento.serializer import (
+    GerarDocumentoComPDFProprioSerializer,
+    GerarDocumentoComPDFProprioSerializerData,
+    GerarDocumentoSerializerData,
+)
 os.environ["LANGCHAIN_TRACING_V2"] = "true"
 async def gerar_documento(
+    serializer: Union[
+        GerarDocumentoSerializerData, GerarDocumentoComPDFProprioSerializerData, Any
+    ],
+    listaPDFs,
+    isBubble=False,
 ):
     """Parâmetro "contexto" só deve ser passado quando quiser utilizar o teste com ragas, e assim, não quiser passar PDFs"""
     try:
+        contextual_retriever = ContextualRetriever(serializer)
         # Initialize enhanced summarizer
+        summarizer = GerarDocumento(serializer)
         all_PDFs_chunks, full_text_as_array, full_text_as_string = (
             await get_full_text_and_all_PDFs_chunks(
         if isBubble:
             print("COMEÇANDO A REQUISIÇÃO FINAL PARA O BUBBLE")
             enviar_resposta_final(
+                serializer.doc_id,  # type: ignore
+                serializer.form_response_id,  # type: ignore
+                serializer.version,  # type: ignore
                 texto_completo_como_html,
                 False,
                 cast(str, titulo_do_documento),

_utils/gerar_relatorio_modelo_usuario/GerarDocumento.py CHANGED Viewed

@@ -1,9 +1,13 @@
 import os
-from typing import List, Dict, Tuple, Optional, cast
 from pydantic import SecretStr
 from _utils.langchain_utils.LLM_class import LLM
 from _utils.langchain_utils.Vector_store_class import VectorStore
 from setup.easy_imports import (
     Chroma,
     ChatOpenAI,
@@ -23,6 +27,25 @@ from cohere import Client
 from _utils.langchain_utils.Splitter_class import Splitter
 class GerarDocumento:
     openai_api_key = os.environ.get("OPENAI_API_KEY", "")
     cohere_api_key = os.environ.get("COHERE_API_KEY", "")
@@ -30,35 +53,31 @@ class GerarDocumento:
     def __init__(
         self,
-        config: RetrievalConfig,
-        embedding_model,
-        chunk_size,
-        chunk_overlap,
-        num_k_rerank,
-        model_cohere_rerank,
-        # prompt_auxiliar,
-        gpt_model,
-        gpt_temperature,
-        # id_modelo_do_usuario,
-        prompt_gerar_documento,
-        reciprocal_rank_fusion,
     ):
-        self.config = config
         self.logger = logging.getLogger(__name__)
         # self.prompt_auxiliar = prompt_auxiliar
-        self.gpt_model = gpt_model
-        self.gpt_temperature = gpt_temperature
-        self.prompt_gerar_documento = prompt_gerar_documento
-        self.reciprocal_rank_fusion = reciprocal_rank_fusion
         self.openai_api_key = self.openai_api_key
         self.cohere_client = Client(self.cohere_api_key)
-        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
-        self.num_k_rerank = num_k_rerank
-        self.model_cohere_rerank = model_cohere_rerank
-        self.splitter = Splitter(chunk_size, chunk_overlap)
-        self.vector_store = VectorStore(embedding_model)
     def retrieve_with_rank_fusion(
         self, vector_store: Chroma, bm25: BM25Okapi, chunk_ids: List[str], query: str
@@ -101,11 +120,9 @@ class GerarDocumento:
             result_lists = [embedding_list, bm25_list]
             weights = [self.config.embedding_weight, self.config.bm25_weight]
-            combined_results = self.reciprocal_rank_fusion(
-                result_lists, weights=weights
-            )
-            return combined_results
         except Exception as e:
             self.logger.error(f"Error in rank fusion retrieval: {str(e)}")
@@ -189,7 +206,7 @@ class GerarDocumento:
             # self.resumo_gerado = cast(str, resumo_auxiliar_do_documento.content)
             prompt_gerar_documento = PromptTemplate(
-                template=self.prompt_gerar_documento,
                 input_variables=["context"],
             )

 import os
+from typing import Any, List, Dict, Tuple, Optional, Union, cast
 from pydantic import SecretStr
 from _utils.langchain_utils.LLM_class import LLM
 from _utils.langchain_utils.Vector_store_class import VectorStore
+from gerar_documento.serializer import (
+    GerarDocumentoComPDFProprioSerializerData,
+    GerarDocumentoSerializerData,
+)
 from setup.easy_imports import (
     Chroma,
     ChatOpenAI,
 from _utils.langchain_utils.Splitter_class import Splitter
def reciprocal_rank_fusion(result_lists, weights=None):
    """Fuse several scored result lists into one ranking by weighted score sum.

    Despite its name, this function does not apply the classic reciprocal-rank
    formula (``1 / (k + rank)``). Each document's fused score is the sum, over
    all lists, of ``weights[i] * score`` for every entry with that document id.

    Args:
        result_lists: Sequence of lists of ``(doc_id, score)`` pairs, one list
            per ranking source.
        weights: Optional per-list multipliers, indexed like ``result_lists``.
            When ``None``, every list gets a weight of ``1.0``.

    Returns:
        A list of ``(doc_id, fused_score)`` tuples sorted by fused score in
        descending order. Documents with equal scores keep first-seen order,
        since ``sorted`` is stable and dicts preserve insertion order.

    Raises:
        IndexError: If a non-empty list has no matching entry in ``weights``.
    """
    if weights is None:
        weights = [1.0] * len(result_lists)

    fused_scores = {}
    for list_index, results in enumerate(result_lists):
        for doc_id, score in results:
            # The weight is looked up per entry (not hoisted) so that an empty
            # list never triggers a lookup in a too-short ``weights``.
            fused_scores[doc_id] = (
                fused_scores.get(doc_id, 0) + weights[list_index] * score
            )

    # Highest fused score first.
    return sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
 class GerarDocumento:
     openai_api_key = os.environ.get("OPENAI_API_KEY", "")
     cohere_api_key = os.environ.get("COHERE_API_KEY", "")
     def __init__(
         self,
+        serializer: Union[
+            GerarDocumentoSerializerData, GerarDocumentoComPDFProprioSerializerData, Any
+        ],
     ):
+        self.config = RetrievalConfig(
+            num_chunks=serializer.num_chunks_retrieval,
+            embedding_weight=serializer.embedding_weight,
+            bm25_weight=serializer.bm25_weight,
+            context_window=serializer.context_window,
+            chunk_overlap=serializer.chunk_overlap,
+        )
         self.logger = logging.getLogger(__name__)
         # self.prompt_auxiliar = prompt_auxiliar
+        self.gpt_model = serializer.model
+        self.gpt_temperature = serializer.gpt_temperature
+        self.prompt_gerar_documento = serializer.prompt_gerar_documento
         self.openai_api_key = self.openai_api_key
         self.cohere_client = Client(self.cohere_api_key)
+        self.embeddings = HuggingFaceEmbeddings(model_name=serializer.hf_embedding)
+        self.num_k_rerank = serializer.num_k_rerank
+        self.model_cohere_rerank = serializer.model_cohere_rerank
+        self.splitter = Splitter(serializer.chunk_size, serializer.chunk_overlap)
+        self.vector_store = VectorStore(serializer.hf_embedding)
     def retrieve_with_rank_fusion(
         self, vector_store: Chroma, bm25: BM25Okapi, chunk_ids: List[str], query: str
             result_lists = [embedding_list, bm25_list]
             weights = [self.config.embedding_weight, self.config.bm25_weight]
+            combined_results = reciprocal_rank_fusion(result_lists, weights=weights)
+            return combined_results  # type: ignore
         except Exception as e:
             self.logger.error(f"Error in rank fusion retrieval: {str(e)}")
             # self.resumo_gerado = cast(str, resumo_auxiliar_do_documento.content)
             prompt_gerar_documento = PromptTemplate(
+                template=cast(str, self.prompt_gerar_documento),
                 input_variables=["context"],
             )

_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py CHANGED Viewed

@@ -4,7 +4,7 @@ from _utils.gerar_relatorio_modelo_usuario.utils import (
     get_response_from_auxiliar_contextual_prompt,
     validate_many_chunks_in_one_request,
 )
-from typing import Any, List, Dict, Tuple, Optional, cast
 from anthropic import Anthropic, AsyncAnthropic
 import logging
 from langchain.schema import Document
@@ -13,7 +13,11 @@ import asyncio
 from typing import List
 from dataclasses import dataclass
-from _utils.gerar_relatorio_modelo_usuario.llm_calls import aclaude_answer, agemini_answer, agpt_answer
 from _utils.gerar_relatorio_modelo_usuario.prompts import contextual_prompt
 from _utils.models.gerar_relatorio import (
     ContextualizedChunk,
@@ -22,16 +26,32 @@ from _utils.models.gerar_relatorio import (
 )
 from langchain_core.messages import HumanMessage
 lista_contador = []
 class ContextualRetriever:
-    def __init__(self, config: RetrievalConfig, claude_context_model: str):
-        self.config = config
         self.logger = logging.getLogger(__name__)
         self.bm25 = None
-        self.claude_context_model = claude_context_model
         self.claude_api_key = os.environ.get("CLAUDE_API_KEY", "")
         self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
@@ -81,15 +101,15 @@ class ContextualRetriever:
             for attempt in range(4):
                 if attempt != 0:
-                    print("------------- FORMATAÇÃO DO CONTEXTUAL INCORRETA - TENTANDO NOVAMENTE -------------")
-                print(
-                    f"TENTATIVA FORMATAÇÃO CHUNKS NÚMERO {attempt + 1}"
-                )
                 print("COMEÇANDO UMA REQUISIÇÃO DO CONTEXTUAL")
                 # raw_response = await agpt_answer(prompt)
                 # raw_response = await agemini_answer(prompt, "gemini-2.0-flash-lite-preview-02-05")
                 raw_response = await agemini_answer(prompt, "gemini-2.0-flash-lite")
                 print("TERMINOU UMA REQUISIÇÃO DO CONTEXTUAL")
                 response = cast(str, raw_response)
                 # response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
@@ -131,10 +151,10 @@ class ContextualRetriever:
                         context=result[index][1],
                     )
                 )
-        except BaseException as e :
             print(e)
             print("\nERRO DO CONTEXTUAL")
-            print('\n\nresult', result)
         return lista_chunks

     get_response_from_auxiliar_contextual_prompt,
     validate_many_chunks_in_one_request,
 )
+from typing import Any, List, Dict, Tuple, Optional, Union, cast
 from anthropic import Anthropic, AsyncAnthropic
 import logging
 from langchain.schema import Document
 from typing import List
 from dataclasses import dataclass
+from _utils.gerar_relatorio_modelo_usuario.llm_calls import (
+    aclaude_answer,
+    agemini_answer,
+    agpt_answer,
+)
 from _utils.gerar_relatorio_modelo_usuario.prompts import contextual_prompt
 from _utils.models.gerar_relatorio import (
     ContextualizedChunk,
 )
 from langchain_core.messages import HumanMessage
+from gerar_documento.serializer import (
+    GerarDocumentoComPDFProprioSerializerData,
+    GerarDocumentoSerializerData,
+)
 lista_contador = []
 class ContextualRetriever:
+    def __init__(
+        self,
+        serializer: Union[
+            GerarDocumentoSerializerData, GerarDocumentoComPDFProprioSerializerData, Any
+        ],
+    ):
+        self.config = RetrievalConfig(
+            num_chunks=serializer.num_chunks_retrieval,
+            embedding_weight=serializer.embedding_weight,
+            bm25_weight=serializer.bm25_weight,
+            context_window=serializer.context_window,
+            chunk_overlap=serializer.chunk_overlap,
+        )
         self.logger = logging.getLogger(__name__)
         self.bm25 = None
+        self.claude_context_model = serializer.claude_context_model
         self.claude_api_key = os.environ.get("CLAUDE_API_KEY", "")
         self.claude_client = AsyncAnthropic(api_key=self.claude_api_key)
             for attempt in range(4):
                 if attempt != 0:
+                    print(
+                        "------------- FORMATAÇÃO DO CONTEXTUAL INCORRETA - TENTANDO NOVAMENTE -------------"
+                    )
+                print(f"TENTATIVA FORMATAÇÃO CHUNKS NÚMERO {attempt + 1}")
                 print("COMEÇANDO UMA REQUISIÇÃO DO CONTEXTUAL")
                 # raw_response = await agpt_answer(prompt)
                 # raw_response = await agemini_answer(prompt, "gemini-2.0-flash-lite-preview-02-05")
                 raw_response = await agemini_answer(prompt, "gemini-2.0-flash-lite")
                 print("TERMINOU UMA REQUISIÇÃO DO CONTEXTUAL")
                 response = cast(str, raw_response)
                 # response = await llms.deepseek().ainvoke([HumanMessage(content=prompt)])
                         context=result[index][1],
                     )
                 )
+        except BaseException as e:
             print(e)
             print("\nERRO DO CONTEXTUAL")
+            print("\n\nresult", result)
         return lista_chunks

gerar_documento/serializer.py CHANGED Viewed

@@ -157,7 +157,7 @@ class GerarDocumentoComPDFProprioSerializer(GerarDocumentoInitialSerializer):
 @dataclass
-class GerarDocumentoComPDFProprioData(GerarDocumentoInitialSerializerData):
     prompt_gerar_documento: Optional[str] = field(default=None)
     user_message: Optional[str] = field(default=None)
     num_chunks_retrieval: int = field(default=20)

 @dataclass
+class GerarDocumentoComPDFProprioSerializerData(GerarDocumentoInitialSerializerData):
     prompt_gerar_documento: Optional[str] = field(default=None)
     user_message: Optional[str] = field(default=None)
     num_chunks_retrieval: int = field(default=20)