Spaces:
Running
Running
luanpoppe
committed on
Commit
·
93c6cb3
1
Parent(s):
451f8a3
fix: one chunks
Browse files
_utils/gerar_documento.py
CHANGED
@@ -55,13 +55,11 @@ async def gerar_documento(
|
|
55 |
# Initialize enhanced summarizer
|
56 |
summarizer = GerarDocumento(serializer)
|
57 |
|
58 |
-
all_PDFs_chunks, full_text_as_array
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
isBubble,
|
64 |
-
)
|
65 |
)
|
66 |
|
67 |
is_contextualized_chunk = serializer.should_have_contextual_chunks
|
@@ -104,7 +102,7 @@ async def gerar_documento(
|
|
104 |
# Create enhanced vector store and BM25 index
|
105 |
vector_store, bm25, chunk_ids = (
|
106 |
summarizer.vector_store.create_enhanced_vector_store(
|
107 |
-
chunks_processados, is_contextualized_chunk
|
108 |
)
|
109 |
)
|
110 |
|
|
|
55 |
# Initialize enhanced summarizer
|
56 |
summarizer = GerarDocumento(serializer)
|
57 |
|
58 |
+
all_PDFs_chunks, full_text_as_array = await get_full_text_and_all_PDFs_chunks(
|
59 |
+
listaPDFs,
|
60 |
+
summarizer.splitter,
|
61 |
+
serializer.should_use_llama_parse,
|
62 |
+
isBubble,
|
|
|
|
|
63 |
)
|
64 |
|
65 |
is_contextualized_chunk = serializer.should_have_contextual_chunks
|
|
|
102 |
# Create enhanced vector store and BM25 index
|
103 |
vector_store, bm25, chunk_ids = (
|
104 |
summarizer.vector_store.create_enhanced_vector_store(
|
105 |
+
chunks_processados, is_contextualized_chunk, axiom_instance
|
106 |
)
|
107 |
)
|
108 |
|
_utils/gerar_relatorio_modelo_usuario/utils.py
CHANGED
@@ -158,7 +158,7 @@ async def get_full_text_and_all_PDFs_chunks(
|
|
158 |
)
|
159 |
all_PDFs_chunks = all_PDFs_chunks + chunks
|
160 |
|
161 |
-
return all_PDFs_chunks, pages
|
162 |
|
163 |
|
164 |
async def generate_document_title(resumo_para_gerar_titulo: str):
|
|
|
158 |
)
|
159 |
all_PDFs_chunks = all_PDFs_chunks + chunks
|
160 |
|
161 |
+
return all_PDFs_chunks, pages
|
162 |
|
163 |
|
164 |
async def generate_document_title(resumo_para_gerar_titulo: str):
|
_utils/langchain_utils/Splitter_class.py
CHANGED
@@ -41,7 +41,6 @@ class Splitter:
|
|
41 |
# pages = get_pdf_from_bubble(
|
42 |
# pdf_path
|
43 |
# ) # Gera uma lista de objetos Document, sendo cada item da lista referente a UMA PÁGINA inteira do PDF.
|
44 |
-
full_text_as_string = ""
|
45 |
|
46 |
chunks_of_string_only: List[str] = []
|
47 |
|
@@ -137,7 +136,7 @@ class Splitter:
|
|
137 |
# char_count += len(text)
|
138 |
print("TERMINOU DE ORGANIZAR PDFS EM CHUNKS")
|
139 |
|
140 |
-
return chunks, chunks_of_string_only
|
141 |
|
142 |
def load_and_split_text(self, text: str) -> List[DocumentChunk]:
|
143 |
"""Load Text and split into chunks with metadata - Criei essa função apenas para o ragas"""
|
|
|
41 |
# pages = get_pdf_from_bubble(
|
42 |
# pdf_path
|
43 |
# ) # Gera uma lista de objetos Document, sendo cada item da lista referente a UMA PÁGINA inteira do PDF.
|
|
|
44 |
|
45 |
chunks_of_string_only: List[str] = []
|
46 |
|
|
|
136 |
# char_count += len(text)
|
137 |
print("TERMINOU DE ORGANIZAR PDFS EM CHUNKS")
|
138 |
|
139 |
+
return chunks, chunks_of_string_only
|
140 |
|
141 |
def load_and_split_text(self, text: str) -> List[DocumentChunk]:
|
142 |
"""Load Text and split into chunks with metadata - Criei essa função apenas para o ragas"""
|
_utils/langchain_utils/Vector_store_class.py
CHANGED
@@ -6,6 +6,8 @@ from _utils.models.gerar_relatorio import (
|
|
6 |
from setup.easy_imports import Chroma, BM25Okapi, HuggingFaceEmbeddings
|
7 |
import logging
|
8 |
|
|
|
|
|
9 |
|
10 |
class VectorStore:
|
11 |
def __init__(self, embedding_model):
|
@@ -14,7 +16,10 @@ class VectorStore:
|
|
14 |
pass
|
15 |
|
16 |
def create_enhanced_vector_store(
|
17 |
-
self,
|
|
|
|
|
|
|
18 |
) -> Tuple[Chroma, BM25Okapi, List[str]]:
|
19 |
"""Create vector store and BM25 index with contextualized chunks"""
|
20 |
try:
|
@@ -23,7 +28,7 @@ class VectorStore:
|
|
23 |
texts = [
|
24 |
f"""<one_chunk>
|
25 |
<document_id>{chunk.id_do_processo}</document_id>
|
26 |
-
<document_context_title>
|
27 |
<document_contextual_summary>{chunk.contextual_summary}</document_contextual_summary>
|
28 |
<document_content>Document_content: {chunk.content}</document_content>
|
29 |
</one_chunk>
|
@@ -31,6 +36,7 @@ class VectorStore:
|
|
31 |
"""
|
32 |
for chunk in chunks
|
33 |
]
|
|
|
34 |
else:
|
35 |
texts = [f"{chunk.content}" for chunk in chunks]
|
36 |
|
|
|
6 |
from setup.easy_imports import Chroma, BM25Okapi, HuggingFaceEmbeddings
|
7 |
import logging
|
8 |
|
9 |
+
from setup.logging import Axiom
|
10 |
+
|
11 |
|
12 |
class VectorStore:
|
13 |
def __init__(self, embedding_model):
|
|
|
16 |
pass
|
17 |
|
18 |
def create_enhanced_vector_store(
|
19 |
+
self,
|
20 |
+
chunks: List[ContextualizedChunk],
|
21 |
+
is_contextualized_chunk,
|
22 |
+
axiom_instance: Axiom,
|
23 |
) -> Tuple[Chroma, BM25Okapi, List[str]]:
|
24 |
"""Create vector store and BM25 index with contextualized chunks"""
|
25 |
try:
|
|
|
28 |
texts = [
|
29 |
f"""<one_chunk>
|
30 |
<document_id>{chunk.id_do_processo}</document_id>
|
31 |
+
<document_context_title>{chunk.context}</document_context_title>
|
32 |
<document_contextual_summary>{chunk.contextual_summary}</document_contextual_summary>
|
33 |
<document_content>Document_content: {chunk.content}</document_content>
|
34 |
</one_chunk>
|
|
|
36 |
"""
|
37 |
for chunk in chunks
|
38 |
]
|
39 |
+
axiom_instance.send_axiom(f"Chunks gerados: {texts}")
|
40 |
else:
|
41 |
texts = [f"{chunk.content}" for chunk in chunks]
|
42 |
|