Spaces:
Sleeping
Sleeping
luanpoppe
committed on
Commit
·
234f840
1
Parent(s):
55f46c1
feat: adicionando melhorias e correções do contextual retriever
Browse files
_antigos/resumos/serializer.py
CHANGED
|
@@ -25,5 +25,5 @@ class ResumoCursorSerializer(serializers.Serializer):
|
|
| 25 |
user_message = serializers.CharField(required=False, default="")
|
| 26 |
model = serializers.CharField(required=False, default=default_model)
|
| 27 |
hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
|
| 28 |
-
chunk_size = serializers.IntegerField(required=False, default=
|
| 29 |
-
chunk_overlap = serializers.IntegerField(required=False, default=
|
|
|
|
| 25 |
user_message = serializers.CharField(required=False, default="")
|
| 26 |
model = serializers.CharField(required=False, default=default_model)
|
| 27 |
hf_embedding = serializers.CharField(required=False, default="all-MiniLM-L6-v2")
|
| 28 |
+
chunk_size = serializers.IntegerField(required=False, default=5000)
|
| 29 |
+
chunk_overlap = serializers.IntegerField(required=False, default=1600)
|
_utils/gerar_relatorio_modelo_usuario/contextual_retriever.py
CHANGED
|
@@ -174,7 +174,8 @@ class ContextualRetriever:
|
|
| 174 |
ContextualizedChunk(
|
| 175 |
content=chunk.content,
|
| 176 |
page_number=chunk.page_number,
|
| 177 |
-
|
|
|
|
| 178 |
start_char=chunk.start_char,
|
| 179 |
end_char=chunk.end_char,
|
| 180 |
context=" ".join(result[index][1:2]),
|
|
|
|
| 174 |
ContextualizedChunk(
|
| 175 |
content=chunk.content,
|
| 176 |
page_number=chunk.page_number,
|
| 177 |
+
id_do_processo=result[index][0],
|
| 178 |
+
chunk_id=chunk.chunk_id,
|
| 179 |
start_char=chunk.start_char,
|
| 180 |
end_char=chunk.end_char,
|
| 181 |
context=" ".join(result[index][1:2]),
|
_utils/models/gerar_relatorio.py
CHANGED
|
@@ -10,6 +10,7 @@ class DocumentChunk:
|
|
| 10 |
chunk_id: str
|
| 11 |
start_char: int
|
| 12 |
end_char: int
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
@dataclass
|
|
|
|
| 10 |
chunk_id: str
|
| 11 |
start_char: int
|
| 12 |
end_char: int
|
| 13 |
+
id_do_processo: int = 0
|
| 14 |
|
| 15 |
|
| 16 |
@dataclass
|
_utils/vector_stores/Vector_store_class.py
CHANGED
|
@@ -21,7 +21,7 @@ class VectorStore:
|
|
| 21 |
# Prepare texts with context
|
| 22 |
if is_contextualized_chunk:
|
| 23 |
texts = [
|
| 24 |
-
f"Document_id: {chunk.
|
| 25 |
for chunk in chunks
|
| 26 |
]
|
| 27 |
else:
|
|
@@ -35,6 +35,7 @@ class VectorStore:
|
|
| 35 |
metadatas.append(
|
| 36 |
{
|
| 37 |
"chunk_id": chunk.chunk_id,
|
|
|
|
| 38 |
"page": chunk.page_number,
|
| 39 |
"start_char": chunk.start_char,
|
| 40 |
"end_char": chunk.end_char,
|
|
@@ -46,6 +47,7 @@ class VectorStore:
|
|
| 46 |
metadatas.append(
|
| 47 |
{
|
| 48 |
"chunk_id": chunk.chunk_id,
|
|
|
|
| 49 |
"page": chunk.page_number,
|
| 50 |
"start_char": chunk.start_char,
|
| 51 |
"end_char": chunk.end_char,
|
|
|
|
| 21 |
# Prepare texts with context
|
| 22 |
if is_contextualized_chunk:
|
| 23 |
texts = [
|
| 24 |
+
f"Document_id: {chunk.id_do_processo}\nDocument_context: {chunk.context}\nDocument_content: {chunk.content}"
|
| 25 |
for chunk in chunks
|
| 26 |
]
|
| 27 |
else:
|
|
|
|
| 35 |
metadatas.append(
|
| 36 |
{
|
| 37 |
"chunk_id": chunk.chunk_id,
|
| 38 |
+
"id_do_processo": chunk.id_do_processo,
|
| 39 |
"page": chunk.page_number,
|
| 40 |
"start_char": chunk.start_char,
|
| 41 |
"end_char": chunk.end_char,
|
|
|
|
| 47 |
metadatas.append(
|
| 48 |
{
|
| 49 |
"chunk_id": chunk.chunk_id,
|
| 50 |
+
"id_do_processo": chunk.id_do_processo,
|
| 51 |
"page": chunk.page_number,
|
| 52 |
"start_char": chunk.start_char,
|
| 53 |
"end_char": chunk.end_char,
|
gerar_documento/serializer.py
CHANGED
|
@@ -33,7 +33,7 @@ class GerarDocumentoSerializer(ResumoCursorSerializer):
|
|
| 33 |
embedding_weight = serializers.FloatField(default=0.5)
|
| 34 |
bm25_weight = serializers.FloatField(default=0.5)
|
| 35 |
context_window = serializers.IntegerField(default=3)
|
| 36 |
-
chunk_overlap = serializers.IntegerField(default=
|
| 37 |
num_k_rerank = serializers.IntegerField(default=20)
|
| 38 |
model_cohere_rerank = serializers.CharField(
|
| 39 |
required=False, default="rerank-english-v2.0"
|
|
@@ -61,7 +61,7 @@ class GerarDocumentoComPDFProprioSerializer(ResumoCursorSerializer):
|
|
| 61 |
embedding_weight = serializers.FloatField(default=0.5)
|
| 62 |
bm25_weight = serializers.FloatField(default=0.5)
|
| 63 |
context_window = serializers.IntegerField(default=3)
|
| 64 |
-
chunk_overlap = serializers.IntegerField(default=
|
| 65 |
num_k_rerank = serializers.IntegerField(default=20)
|
| 66 |
model_cohere_rerank = serializers.CharField(
|
| 67 |
required=False, default="rerank-english-v2.0"
|
|
|
|
| 33 |
embedding_weight = serializers.FloatField(default=0.5)
|
| 34 |
bm25_weight = serializers.FloatField(default=0.5)
|
| 35 |
context_window = serializers.IntegerField(default=3)
|
| 36 |
+
chunk_overlap = serializers.IntegerField(default=1600)
|
| 37 |
num_k_rerank = serializers.IntegerField(default=20)
|
| 38 |
model_cohere_rerank = serializers.CharField(
|
| 39 |
required=False, default="rerank-english-v2.0"
|
|
|
|
| 61 |
embedding_weight = serializers.FloatField(default=0.5)
|
| 62 |
bm25_weight = serializers.FloatField(default=0.5)
|
| 63 |
context_window = serializers.IntegerField(default=3)
|
| 64 |
+
chunk_overlap = serializers.IntegerField(default=1600)
|
| 65 |
num_k_rerank = serializers.IntegerField(default=20)
|
| 66 |
model_cohere_rerank = serializers.CharField(
|
| 67 |
required=False, default="rerank-english-v2.0"
|