File size: 7,950 Bytes
756fca0
 
1286e81
 
12d3e1a
cb23311
1286e81
a263183
756fca0
1286e81
 
 
 
bdf043b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
756fca0
 
 
 
 
 
 
 
 
 
 
cb23311
 
 
 
 
 
756fca0
 
 
 
 
 
 
bdf043b
1286e81
cb23311
 
 
 
 
 
4a04d77
 
 
12d3e1a
 
1286e81
 
55f46c1
1286e81
 
 
78209bc
55f46c1
1286e81
 
 
55f46c1
1286e81
 
 
 
23087eb
b374298
8f3dc39
dc376b6
 
 
095b5f1
 
 
7eb86f7
756fca0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a263183
bdf043b
7eb86f7
4a04d77
 
 
7eb86f7
 
 
 
55f46c1
7eb86f7
 
 
78209bc
55f46c1
7eb86f7
 
 
55f46c1
7eb86f7
 
 
 
 
b374298
e70ffc1
3736ce1
a263183
756fca0
 
 
 
 
a1f037d
756fca0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a263183
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
from dataclasses import dataclass, field
from typing import List, Optional
from rest_framework import serializers
from _utils.gerar_relatorio_modelo_usuario.prompts import (
    prompt_gerar_documento,
    prompt_auxiliar_padrao,
)
from setup.environment import default_model
from django.core.files.uploadedfile import UploadedFile

# Default question sent to the LLM when the caller supplies no user_message.
user_message = "What are the main points of this document?"


# Default system prompt for key-point extraction.
# NOTE(review): the literal deliberately keeps its source indentation and a
# {context} placeholder — presumably filled via str.format() downstream;
# confirm against the pipeline before reformatting.
prompt_template = """
    Based on the following context, provide multiple key points from the document.
    For each point, create a new paragraph.
    Each paragraph should be a complete, self-contained insight.
    
    Context: {context}
    
    Key points:
    """


class GerarDocumentoInitialSerializer(serializers.Serializer):
    """Base input serializer shared by the document-generation endpoints."""

    # Raw multipart uploads at this stage (subclasses may override with
    # references to already-stored files).
    files = serializers.ListField(child=serializers.FileField(), required=True)
    # System prompt for the LLM; falls back to the module-level template.
    system_prompt = serializers.CharField(required=False, default=prompt_template)
    user_message = serializers.CharField(required=False, default="")
    # LLM identifier; default comes from setup.environment.
    model = serializers.CharField(required=False, default=default_model)
    # HuggingFace sentence-embedding model name used for retrieval.
    hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
    # Text-splitting parameters — units depend on the splitter; TODO confirm.
    chunk_size = serializers.IntegerField(required=False, default=3500)
    chunk_overlap = serializers.IntegerField(required=False, default=800)


@dataclass
class GerarDocumentoInitialSerializerData:
    """Plain-data mirror of GerarDocumentoInitialSerializer's validated fields.

    Defaults duplicate the serializer's declared defaults so instances can
    also be constructed directly without going through validation.
    """

    # Validated file payloads (dicts, not UploadedFile objects, at this point).
    files: List[dict]
    system_prompt: str = prompt_template
    user_message: str = ""
    model: str = default_model
    hf_embedding: str = "all-MiniLM-L6-v2"
    chunk_size: int = 3500
    chunk_overlap: int = 800


class FileInfoSerializer(serializers.Serializer):
    """Reference to an already-stored file, identified by id/type/URL."""

    unique_id = serializers.CharField(max_length=255)
    tipo_arquivo = serializers.CharField(max_length=255)  # file type
    link_arquivo = serializers.URLField()  # URL where the file can be fetched


@dataclass
class FileInfoSerializerData:
    """Plain-data mirror of FileInfoSerializer (id / type / URL of a file)."""

    unique_id: str
    tipo_arquivo: str
    link_arquivo: str


class GerarDocumentoSerializer(GerarDocumentoInitialSerializer):
    """Input serializer for the main document-generation endpoint.

    Extends the base serializer with retrieval/reranking tuning knobs and
    Bubble-integration fields.
    """

    # Assigning None to an inherited declared field removes it in DRF, so
    # this endpoint does not accept `system_prompt`.
    system_prompt = None

    # Files arrive as references (id/type/URL), not raw uploads.
    files = serializers.ListField(child=FileInfoSerializer(), required=True)
    bubble_editor_version = serializers.CharField(
        required=False, default="version-test"
    )  # Value used inside the URL of the request to Bubble

    # prompt_auxiliar = serializers.CharField(
    #     required=False, default=prompt_auxiliar_padrao
    # )
    prompt_gerar_documento = serializers.CharField(
        required=False, default=prompt_gerar_documento
    )
    user_message = serializers.CharField(required=False, default=user_message)
    # Hybrid-retrieval parameters: candidate count plus relative weights of
    # the embedding score and the BM25 score.
    num_chunks_retrieval = serializers.IntegerField(default=20)
    embedding_weight = serializers.FloatField(default=0.5)
    bm25_weight = serializers.FloatField(default=0.5)
    context_window = serializers.IntegerField(default=3)
    chunk_overlap = serializers.IntegerField(default=800)
    # Reranking: rerank a larger candidate pool, keep the top k.
    num_k_rerank = serializers.IntegerField(default=20)
    model_cohere_rerank = serializers.CharField(
        required=False, default="rerank-english-v2.0"
    )
    more_initial_chunks_for_reranking = serializers.IntegerField(default=100)
    # presumably the model used when building contextual chunks — confirm.
    claude_context_model = serializers.CharField(
        required=False, default="claude-3-haiku-20240307"
    )
    gpt_temperature = serializers.FloatField(default=0)
    id_modelo_do_usuario = serializers.IntegerField(required=False)
    should_have_contextual_chunks = serializers.BooleanField(default=False)  # type: ignore
    should_use_llama_parse = serializers.BooleanField(required=False, default=False)  # type: ignore
    # Model used for the final ("últimas") generation requests.
    llm_ultimas_requests = serializers.CharField(
        required=False, default="gemini-2.0-flash"
    )
    doc_id = serializers.CharField(required=True)
    form_response_id = serializers.CharField(required=True)
    version = serializers.CharField(required=True)

    def get_obj(self):
        """Return validated_data wrapped in GerarDocumentoSerializerData.

        NOTE(review): `files` stays a list of validated dicts here even though
        the dataclass annotates it as List[FileInfoSerializerData] — confirm
        downstream code accesses items as mappings.
        """
        return GerarDocumentoSerializerData(**self.validated_data)  # type: ignore


@dataclass
class GerarDocumentoSerializerData(GerarDocumentoInitialSerializerData):
    """Plain-data counterpart of GerarDocumentoSerializer.validated_data.

    Defaults mirror the serializer's field defaults so the object can also be
    built directly.
    """

    # NOTE(review): annotated as dataclass instances, but get_obj() passes
    # validated dicts straight through — at runtime these are dicts; confirm.
    files: List[FileInfoSerializerData]
    bubble_editor_version: str = "version-test"
    prompt_gerar_documento: str = ""
    user_message: str = ""
    num_chunks_retrieval: int = 20
    embedding_weight: float = 0.5
    bm25_weight: float = 0.5
    context_window: int = 3
    chunk_overlap: int = 800
    num_k_rerank: int = 20
    model_cohere_rerank: str = "rerank-english-v2.0"
    more_initial_chunks_for_reranking: int = 100
    claude_context_model: str = "claude-3-haiku-20240307"
    gpt_temperature: float = 0.0
    # Optional with None default here, unlike the serializer where the field
    # is simply omitted when absent.
    id_modelo_do_usuario: Optional[int] = None
    should_have_contextual_chunks: bool = False
    should_use_llama_parse: bool = False
    llm_ultimas_requests: str = "gemini-2.0-flash"
    doc_id: str = ""
    form_response_id: str = ""
    version: str = ""


class GerarDocumentoComPDFProprioSerializer(GerarDocumentoInitialSerializer):
    """Input serializer for generating a document from the user's own PDF.

    Unlike GerarDocumentoSerializer, it keeps the base raw-upload `files`
    field and does not require doc_id/form_response_id/version.
    """

    # Assigning None to an inherited declared field removes it in DRF, so
    # this endpoint does not accept `system_prompt`.
    system_prompt = None
    # prompt_auxiliar = serializers.CharField(
    #     required=False, default=prompt_auxiliar_padrao
    # )
    prompt_gerar_documento = serializers.CharField(
        required=False, default=prompt_gerar_documento
    )
    user_message = serializers.CharField(required=False, default=user_message)
    # Retrieval/reranking knobs — duplicated from GerarDocumentoSerializer.
    num_chunks_retrieval = serializers.IntegerField(default=20)
    embedding_weight = serializers.FloatField(default=0.5)
    bm25_weight = serializers.FloatField(default=0.5)
    context_window = serializers.IntegerField(default=3)
    chunk_overlap = serializers.IntegerField(default=800)
    num_k_rerank = serializers.IntegerField(default=20)
    model_cohere_rerank = serializers.CharField(
        required=False, default="rerank-english-v2.0"
    )
    more_initial_chunks_for_reranking = serializers.IntegerField(default=100)
    claude_context_model = serializers.CharField(
        required=False, default="claude-3-haiku-20240307"
    )
    gpt_temperature = serializers.FloatField(default=0)
    # Here the user-model id defaults to 11, whereas the main serializer
    # leaves it optional with no default.
    id_modelo_do_usuario = serializers.IntegerField(required=False, default=11)
    should_have_contextual_chunks = serializers.BooleanField(default=False)  # type: ignore
    should_use_llama_parse = serializers.BooleanField(required=False, default=False)  # type: ignore
    llm_ultimas_requests = serializers.CharField(required=False, default="gpt-4o-mini")

    def get_obj(self):
        """Return validated_data wrapped in GerarDocumentoSerializerData.

        NOTE(review): this returns GerarDocumentoSerializerData, not the
        dedicated GerarDocumentoComPDFProprioSerializerData defined below,
        which is therefore unused by this serializer — confirm intent.
        """
        return GerarDocumentoSerializerData(**self.validated_data)  # type: ignore


@dataclass
class GerarDocumentoComPDFProprioSerializerData(GerarDocumentoInitialSerializerData):
    """Plain-data counterpart of GerarDocumentoComPDFProprioSerializer.

    Defaults mirror that serializer's field defaults.
    NOTE(review): currently not constructed by that serializer's get_obj(),
    which returns GerarDocumentoSerializerData instead — confirm intent.
    """

    # Plain defaults are equivalent to the previous `field(default=...)`
    # calls — dataclasses.field() is only needed for default_factory or
    # metadata — and are the idiomatic form.
    prompt_gerar_documento: Optional[str] = None
    user_message: Optional[str] = None
    num_chunks_retrieval: int = 20
    embedding_weight: float = 0.5
    bm25_weight: float = 0.5
    context_window: int = 3
    chunk_overlap: int = 800
    num_k_rerank: int = 20
    model_cohere_rerank: str = "rerank-english-v2.0"
    more_initial_chunks_for_reranking: int = 100
    claude_context_model: str = "claude-3-haiku-20240307"
    gpt_temperature: float = 0.0
    id_modelo_do_usuario: int = 11
    should_have_contextual_chunks: bool = False
    should_use_llama_parse: bool = False
    llm_ultimas_requests: str = "gpt-4o-mini"


class GerarEmentaSerializer(serializers.Serializer):
    """Input serializer for the 'gerar ementa' endpoint.

    Takes file references plus chunking parameters and the Bubble tracking
    fields; no prompt/model tuning knobs are exposed here.
    """

    # Files arrive as references (id/type/URL), not raw uploads.
    files = serializers.ListField(child=FileInfoSerializer(), required=True)
    user_message = serializers.CharField(required=False, default="")
    # Text-splitting parameters — same defaults as the other endpoints.
    chunk_size = serializers.IntegerField(required=False, default=3500)
    chunk_overlap = serializers.IntegerField(required=False, default=800)
    bubble_editor_version = serializers.CharField(
        required=False, default="version-test"
    )  # Value used inside the URL of the request to Bubble
    doc_id = serializers.CharField(required=True)
    form_response_id = serializers.CharField(required=True)
    version = serializers.CharField(required=True)