Commit c6dbb49 · committed by luanpoppe
Parent(s): ec8caf1
fix: lidar com arquivos grandes com muitos tokens
Files changed:
- _utils/gerar_relatorio_modelo_usuario/utils.py +94 -26
- _utils/main.py +0 -73
- _utils/resumo_completo_cursor.py +10 -6
- _utils/splitters/Splitter_class.py +4 -1
- gerar_documento/views.py +2 -2
_utils/gerar_relatorio_modelo_usuario/utils.py
CHANGED
@@ -89,35 +89,101 @@ def validate_many_chunks_in_one_request(
 
 
 # Esta função gera a resposta que será usada em cada um das requisições de cada chunk
-async def get_response_from_auxiliar_contextual_prompt(
-    # response_auxiliar_summary = await aclaude_answer(
-    #     self.claude_client, self.claude_context_model, prompt_auxiliar_summary
-    # )
+# async def get_response_from_auxiliar_contextual_prompt(
+#     full_text_as_array: List[str], full_text_as_string: str
+# ):
+#     print("full_text_as_string: ", full_text_as_string)
 
+#     print("\nCRIANDO PROMPT AUXILIAR DO CONTEXTUAL")
+#     # PROMPT PARA GERAR O RESUMO INICIAL DO PROCESSO
+#     prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(
+#         full_text_as_string
+#     )
 
+#     import tiktoken
+#     import re
 
+#     # full_text_as_string = re.sub(r"\s+", " ", full_text_as_string).strip()
 
+#     encoding = tiktoken.get_encoding("cl100k_base")
+#     # Count tokens
+#     num_tokens = len(encoding.encode(full_text_as_string))
+#     with open("output.txt", "w", encoding="utf-8") as file:
+#         file.write(full_text_as_string)
+#     print(f"CONTAGEM DE TOKENS - {num_tokens}")
 
+#     print("\n\nprompt_auxiliar_summary[0:500]: ", prompt_auxiliar_summary[0:500])
+
+#     # Claude comentado pois o limite de tokens estava sendo passado pela requisição e dava erro
+#     # response_auxiliar_summary = await aclaude_answer(
+#     #     self.claude_client, self.claude_context_model, prompt_auxiliar_summary
+#     # )
+
+#     llms = LLM()
+#     print("\nCOMEÇANDO REQUISIÇÃO AUXILIAR DO CONTEXTUAL")
+#     response_auxiliar_summary = await llms.google_gemini().ainvoke(
+#         [HumanMessage(content=prompt_auxiliar_summary)]
+#     )
+#     print("TERMINOU REQUISIÇÃO AUXILIAR DO CONTEXTUAL")
+
+#     print(
+#         "\n\nresponse_auxiliar_summary.content[0:500]: ",
+#         response_auxiliar_summary.content[0:500],
+#     )
+
+#     return response_auxiliar_summary.content
+
+
+async def get_response_from_auxiliar_contextual_prompt(
+    full_text_as_array: List[str], full_text_as_string: str
+):
+    import tiktoken
+
+    encoding = tiktoken.get_encoding("cl100k_base")
+    llms = LLM()
+    responses = []
+
+    current_chunk = []
+    current_token_count = 0
+    chunk_counter = 1
+
+    for part in full_text_as_array:
+        part_tokens = len(encoding.encode(part))
+
+        # Check if adding this part would EXCEED the limit
+        if current_token_count + part_tokens > 600000:
+            # Process the accumulated chunk before it exceeds the limit
+            chunk_text = "".join(current_chunk)
+            print(
+                f"\nProcessing chunk {chunk_counter} with {current_token_count} tokens"
+            )
+
+            prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
+            response = await llms.google_gemini().ainvoke(
+                [HumanMessage(content=prompt)]
+            )
+            responses.append(response.content)
+
+            # Start new chunk with current part
+            current_chunk = [part]
+            current_token_count = part_tokens
+            chunk_counter += 1
+        else:
+            # Safe to add to current chunk
+            current_chunk.append(part)
+            current_token_count += part_tokens
+
+    # Process the final remaining chunk
+    if current_chunk:
+        chunk_text = "".join(current_chunk)
+        print(
+            f"\nProcessing final chunk {chunk_counter} with {current_token_count} tokens"
+        )
+        prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
+        response = await llms.google_gemini().ainvoke([HumanMessage(content=prompt)])
+        responses.append(response.content)
 
-    return
+    return "".join(responses)
 
 
 async def get_full_text_and_all_PDFs_chunks(

@@ -132,12 +198,14 @@ async def get_full_text_and_all_PDFs_chunks(
 
     # Load and process document
     for pdf_path in listaPDFs:
-        chunks, pages = 
+        chunks, pages, full_text_as_string = (
+            await splitterObject.load_and_split_document(
+                pdf_path, should_use_llama_parse, isBubble
+            )
         )
         all_PDFs_chunks = all_PDFs_chunks + chunks
 
-    return all_PDFs_chunks, pages
+    return all_PDFs_chunks, pages, full_text_as_string
 
 
 async def generate_document_title(resumo_para_gerar_titulo: str):
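The heart of this commit is the new batching loop above: instead of sending the whole document to the model in a single request, parts of the text are accumulated until adding the next part would push the running cl100k_base token count past 600,000, at which point the accumulated chunk is sent as its own request and a new chunk is started. A minimal standalone sketch of that batching step follows; the batch_by_token_budget helper, its default budget argument, and the example inputs are illustrative assumptions, not part of the repository.

# Illustrative sketch of the token-budget batching idea (requires `pip install tiktoken`).
from typing import List

import tiktoken


def batch_by_token_budget(parts: List[str], budget: int = 600_000) -> List[str]:
    """Group consecutive text parts so each batch stays within `budget` tokens."""
    encoding = tiktoken.get_encoding("cl100k_base")

    batches: List[str] = []
    current: List[str] = []
    current_tokens = 0

    for part in parts:
        part_tokens = len(encoding.encode(part))
        # Close the running batch if adding this part would exceed the budget.
        if current and current_tokens + part_tokens > budget:
            batches.append("".join(current))
            current, current_tokens = [], 0
        current.append(part)
        current_tokens += part_tokens

    if current:
        batches.append("".join(current))
    return batches


if __name__ == "__main__":
    # Each returned batch would then become the input of one LLM request.
    pages = ["lorem ipsum " * 500, "dolor sit amet " * 500, "consectetur " * 500]
    print([len(batch) for batch in batch_by_token_budget(pages, budget=2_000)])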
_utils/main.py
DELETED
@@ -1,73 +0,0 @@
-import os
-from _utils.utils import create_prompt_llm_chain, create_retriever, getPDF, create_llm, create_prompt_llm_chain_summary, process_embedding_summary
-from _utils import utils
-from langchain.chains import create_retrieval_chain
-from langchain_huggingface import HuggingFaceEmbeddings
-from langchain_chroma import Chroma
-from langchain_openai import OpenAIEmbeddings
-from langchain.chains.summarize import load_summarize_chain
-
-os.environ.get("OPENAI_API_KEY")
-
-def get_llm_answer(system_prompt, user_prompt, pdf_url, model, embedding):
-    if embedding == "gpt":
-        embedding_object = OpenAIEmbeddings()
-    else:
-        embedding_object = HuggingFaceEmbeddings(model_name=embedding)
-
-    vectorstore = Chroma(
-        collection_name="documents",
-        embedding_function=embedding_object
-    )
-
-    print('model: ', model)
-    print('embedding: ', embedding)
-    pages = []
-    if pdf_url:
-        pages = getPDF(pdf_url)
-    else:
-        pages = getPDF()
-    retriever = create_retriever(pages, vectorstore)
-    rag_chain = create_retrieval_chain(retriever, create_prompt_llm_chain(system_prompt, model))
-    results = rag_chain.invoke({"input": user_prompt})
-    # print('allIds ARQUIVO MAIN: ', utils.allIds)
-    vectorstore.delete( utils.allIds)
-    vectorstore.delete_collection()
-    utils.allIds = []
-    # print('utils.allIds: ', utils.allIds)
-    return results
-
-def get_llm_answer_summary(system_prompt, user_prompt, pdf_url, model, isIterativeRefinement):
-    print('model: ', model)
-    print('isIterativeRefinement: ', isIterativeRefinement)
-    print('\n\n\n')
-    pages = getPDF(pdf_url)
-    if not isIterativeRefinement:
-        rag_chain = create_prompt_llm_chain_summary(system_prompt, model)
-
-        results = rag_chain.invoke({"input": user_prompt, "context": pages})
-
-        return results
-    else:
-        chain = load_summarize_chain(create_llm(model), "refine", True)
-        result = chain.invoke({"input_documents": pages})
-        print('result: ', result)
-        return result
-    # Obs --> Para passar informações personalizadas --> chain = load_summarize_chain(llm, "refine", True, question_prompt=initial_prompt, refine_prompt=PromptTemplate.from_template(refine_prompt))
-    # Para ver mais opções --> Acessa a origem da função load_summarize_chain , e nela acessa a origem da função _load_refine_chain --> As opções são os parâmetros que esta última função recebe
-
-def get_llm_answer_summary_with_embedding(system_prompt, user_prompt, pdf_url, model, isIterativeRefinement):
-    print('model: ', model)
-    print('isIterativeRefinement: ', isIterativeRefinement)
-    print('\n\n\n')
-    pages = getPDF(pdf_url)
-    full_texto = ""
-    for p in pages:
-        full_texto += p.page_content
-    print('full_texto: ', full_texto)
-
-    rag_chain = process_embedding_summary(system_prompt, model)
-
-    results = rag_chain.invoke({"input": user_prompt, "context": pages})
-
-    return results
_utils/resumo_completo_cursor.py
CHANGED
@@ -86,18 +86,22 @@ async def get_llm_summary_answer_by_cursor_complete(
         reciprocal_rank_fusion=reciprocal_rank_fusion,
     )
 
-    all_PDFs_chunks, full_text_as_array = (
+    all_PDFs_chunks, full_text_as_array, full_text_as_string = (
+        await get_full_text_and_all_PDFs_chunks(
+            listaPDFs,
+            summarizer.splitter,
+            serializer["should_use_llama_parse"],
+            isBubble,
+        )
     )
 
     is_contextualized_chunk = serializer["should_have_contextual_chunks"]
 
     if is_contextualized_chunk:
         response_auxiliar_summary = (
-            await get_response_from_auxiliar_contextual_prompt(
+            await get_response_from_auxiliar_contextual_prompt(
+                full_text_as_array, full_text_as_string
+            )
         )
 
     print("\nCOMEÇANDO A FAZER AS REQUISIÇÕES DO CONTEXTUAL")
_utils/splitters/Splitter_class.py
CHANGED
@@ -48,6 +48,9 @@ class Splitter:
             page_boundaries, combined_text = (
                 combine_documents_without_losing_pagination(pages)
             )
+            full_text_as_string = ""
+            for page in pages:
+                full_text_as_string = full_text_as_string + page.page_content
             initial_chunks = initial_chunks + self.text_splitter.split_text(
                 combined_text
             )

@@ -126,7 +129,7 @@ class Splitter:
         # char_count += len(text)
         print("TERMINOU DE ORGANIZAR PDFS EM CHUNKS")
 
-        return chunks, initial_chunks
+        return chunks, initial_chunks, full_text_as_string
 
     def load_and_split_text(self, text: str) -> List[DocumentChunk]:
         """Load Text and split into chunks with metadata - Criei essa função apenas para o ragas"""
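The third value returned here, full_text_as_string, is simply the concatenation of every page's page_content; it is what get_full_text_and_all_PDFs_chunks passes on and what get_response_from_auxiliar_contextual_prompt receives in the other files of this commit. A small self-contained sketch of that contract, assuming langchain-core Document objects like the ones the Splitter's loader produces (the build_full_text helper and the example pages are made up for illustration):

# Illustrative sketch of the new concatenation step (assumes `pip install langchain-core`).
from typing import List

from langchain_core.documents import Document


def build_full_text(pages: List[Document]) -> str:
    """Concatenate every page's page_content, mirroring the loop added above."""
    full_text_as_string = ""
    for page in pages:
        full_text_as_string = full_text_as_string + page.page_content
    return full_text_as_string


if __name__ == "__main__":
    pages = [Document(page_content="first page text "), Document(page_content="second page text")]
    print(build_full_text(pages))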
gerar_documento/views.py
CHANGED
@@ -125,7 +125,7 @@ class GerarEmentaView(AsyncAPIView):
         listaPDFs = [l["link_arquivo"] for l in data["files"]]
         print("\n\nlistaPDFs: ", listaPDFs)
 
-        all_PDFs_chunks, full_text_as_array = (
+        all_PDFs_chunks, full_text_as_array, full_text_as_string = (
             await get_full_text_and_all_PDFs_chunks(
                 listaPDFs,
                 Splitter(data["chunk_size"], data["chunk_overlap"]),

@@ -177,7 +177,7 @@ class GerarEmentaComPDFProprioView(AsyncAPIView):
         listaPDFs = [l["link_arquivo"] for l in data["files"]]
         print("\n\nlistaPDFs: ", listaPDFs)
 
-        all_PDFs_chunks, full_text_as_array = (
+        all_PDFs_chunks, full_text_as_array, full_text_as_string = (
             await get_full_text_and_all_PDFs_chunks(
                 listaPDFs,
                 Splitter(data["chunk_size"], data["chunk_overlap"]),