# luanpoppe
# feat: melhorando a instanciação de algumas classes de gerar documentos
# a1f037d
from dataclasses import dataclass, field
from typing import List, Optional
from rest_framework import serializers
from _utils.gerar_relatorio_modelo_usuario.prompts import (
prompt_gerar_documento,
prompt_auxiliar_padrao,
)
from setup.environment import default_model
from django.core.files.uploadedfile import UploadedFile
user_message = "What are the main points of this document?"
prompt_template = """
Based on the following context, provide multiple key points from the document.
For each point, create a new paragraph.
Each paragraph should be a complete, self-contained insight.
Context: {context}
Key points:
"""
class GerarDocumentoInitialSerializer(serializers.Serializer):
files = serializers.ListField(child=serializers.FileField(), required=True)
system_prompt = serializers.CharField(required=False, default=prompt_template)
user_message = serializers.CharField(required=False, default="")
model = serializers.CharField(required=False, default=default_model)
hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
chunk_size = serializers.IntegerField(required=False, default=3500)
chunk_overlap = serializers.IntegerField(required=False, default=800)
@dataclass
class GerarDocumentoInitialSerializerData:
files: List[dict]
system_prompt: str = prompt_template
user_message: str = ""
model: str = default_model
hf_embedding: str = "all-MiniLM-L6-v2"
chunk_size: int = 3500
chunk_overlap: int = 800
class FileInfoSerializer(serializers.Serializer):
unique_id = serializers.CharField(max_length=255)
tipo_arquivo = serializers.CharField(max_length=255)
link_arquivo = serializers.URLField()
@dataclass
class FileInfoSerializerData:
unique_id: str
tipo_arquivo: str
link_arquivo: str
class GerarDocumentoSerializer(GerarDocumentoInitialSerializer):
system_prompt = None
files = serializers.ListField(child=FileInfoSerializer(), required=True)
bubble_editor_version = serializers.CharField(
required=False, default="version-test"
) # Será o valor utilizado dentro da URL da requisição pro Bubble
# prompt_auxiliar = serializers.CharField(
# required=False, default=prompt_auxiliar_padrao
# )
prompt_gerar_documento = serializers.CharField(
required=False, default=prompt_gerar_documento
)
user_message = serializers.CharField(required=False, default=user_message)
num_chunks_retrieval = serializers.IntegerField(default=20)
embedding_weight = serializers.FloatField(default=0.5)
bm25_weight = serializers.FloatField(default=0.5)
context_window = serializers.IntegerField(default=3)
chunk_overlap = serializers.IntegerField(default=800)
num_k_rerank = serializers.IntegerField(default=20)
model_cohere_rerank = serializers.CharField(
required=False, default="rerank-english-v2.0"
)
more_initial_chunks_for_reranking = serializers.IntegerField(default=100)
claude_context_model = serializers.CharField(
required=False, default="claude-3-haiku-20240307"
)
gpt_temperature = serializers.FloatField(default=0)
id_modelo_do_usuario = serializers.IntegerField(required=False)
should_have_contextual_chunks = serializers.BooleanField(default=False) # type: ignore
should_use_llama_parse = serializers.BooleanField(required=False, default=False) # type: ignore
llm_ultimas_requests = serializers.CharField(
required=False, default="gemini-2.0-flash"
)
doc_id = serializers.CharField(required=True)
form_response_id = serializers.CharField(required=True)
version = serializers.CharField(required=True)
def get_obj(self):
return GerarDocumentoSerializerData(**self.validated_data) # type: ignore
@dataclass
class GerarDocumentoSerializerData(GerarDocumentoInitialSerializerData):
files: List[FileInfoSerializerData]
bubble_editor_version: str = "version-test"
prompt_gerar_documento: str = ""
user_message: str = ""
num_chunks_retrieval: int = 20
embedding_weight: float = 0.5
bm25_weight: float = 0.5
context_window: int = 3
chunk_overlap: int = 800
num_k_rerank: int = 20
model_cohere_rerank: str = "rerank-english-v2.0"
more_initial_chunks_for_reranking: int = 100
claude_context_model: str = "claude-3-haiku-20240307"
gpt_temperature: float = 0.0
id_modelo_do_usuario: Optional[int] = None
should_have_contextual_chunks: bool = False
should_use_llama_parse: bool = False
llm_ultimas_requests: str = "gemini-2.0-flash"
doc_id: str = ""
form_response_id: str = ""
version: str = ""
class GerarDocumentoComPDFProprioSerializer(GerarDocumentoInitialSerializer):
system_prompt = None
# prompt_auxiliar = serializers.CharField(
# required=False, default=prompt_auxiliar_padrao
# )
prompt_gerar_documento = serializers.CharField(
required=False, default=prompt_gerar_documento
)
user_message = serializers.CharField(required=False, default=user_message)
num_chunks_retrieval = serializers.IntegerField(default=20)
embedding_weight = serializers.FloatField(default=0.5)
bm25_weight = serializers.FloatField(default=0.5)
context_window = serializers.IntegerField(default=3)
chunk_overlap = serializers.IntegerField(default=800)
num_k_rerank = serializers.IntegerField(default=20)
model_cohere_rerank = serializers.CharField(
required=False, default="rerank-english-v2.0"
)
more_initial_chunks_for_reranking = serializers.IntegerField(default=100)
claude_context_model = serializers.CharField(
required=False, default="claude-3-haiku-20240307"
)
gpt_temperature = serializers.FloatField(default=0)
id_modelo_do_usuario = serializers.IntegerField(required=False, default=11)
should_have_contextual_chunks = serializers.BooleanField(default=False) # type: ignore
should_use_llama_parse = serializers.BooleanField(required=False, default=False) # type: ignore
llm_ultimas_requests = serializers.CharField(required=False, default="gpt-4o-mini")
def get_obj(self):
return GerarDocumentoSerializerData(**self.validated_data) # type: ignore
@dataclass
class GerarDocumentoComPDFProprioSerializerData(GerarDocumentoInitialSerializerData):
prompt_gerar_documento: Optional[str] = field(default=None)
user_message: Optional[str] = field(default=None)
num_chunks_retrieval: int = field(default=20)
embedding_weight: float = field(default=0.5)
bm25_weight: float = field(default=0.5)
context_window: int = field(default=3)
chunk_overlap: int = field(default=800)
num_k_rerank: int = field(default=20)
model_cohere_rerank: str = field(default="rerank-english-v2.0")
more_initial_chunks_for_reranking: int = field(default=100)
claude_context_model: str = field(default="claude-3-haiku-20240307")
gpt_temperature: float = field(default=0.0)
id_modelo_do_usuario: int = field(default=11)
should_have_contextual_chunks: bool = field(default=False)
should_use_llama_parse: bool = field(default=False)
llm_ultimas_requests: str = field(default="gpt-4o-mini")
class GerarEmentaSerializer(serializers.Serializer):
files = serializers.ListField(child=FileInfoSerializer(), required=True)
user_message = serializers.CharField(required=False, default="")
chunk_size = serializers.IntegerField(required=False, default=3500)
chunk_overlap = serializers.IntegerField(required=False, default=800)
bubble_editor_version = serializers.CharField(
required=False, default="version-test"
) # Será o valor utilizado dentro da URL da requisição pro Bubble
doc_id = serializers.CharField(required=True)
form_response_id = serializers.CharField(required=True)
version = serializers.CharField(required=True)