luanpoppe committed
Commit c6dbb49 · 1 Parent(s): ec8caf1

fix: handle large files with many tokens (original commit message: "fix: lidar com arquivos grandes com muitos tokens")

_utils/gerar_relatorio_modelo_usuario/utils.py CHANGED
@@ -89,35 +89,101 @@ def validate_many_chunks_in_one_request(
 
 
 # Esta função gera a resposta que será usada em cada um das requisições de cada chunk
-async def get_response_from_auxiliar_contextual_prompt(full_text_as_array: List[str]):
-    full_text = ""
-    for x in full_text_as_array:
-        full_text += x
-
-    print("\nCRIANDO PROMPT AUXILIAR DO CONTEXTUAL")
-    # PROMPT PARA GERAR O RESUMO INICIAL DO PROCESSO
-    prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(full_text)
-
-    print("\n\nprompt_auxiliar_summary[0:500]: ", prompt_auxiliar_summary[0:500])
-
-    # Claude comentado pois o limite de tokens estava sendo passado pela requisição e dava erro
-    # response_auxiliar_summary = await aclaude_answer(
-    #     self.claude_client, self.claude_context_model, prompt_auxiliar_summary
-    # )
-
-    llms = LLM()
-    print("\nCOMEÇANDO REQUISIÇÃO AUXILIAR DO CONTEXTUAL")
-    response_auxiliar_summary = await llms.google_gemini().ainvoke(
-        [HumanMessage(content=prompt_auxiliar_summary)]
-    )
-    print("TERMINOU REQUISIÇÃO AUXILIAR DO CONTEXTUAL")
-
-    print(
-        "\n\nresponse_auxiliar_summary.content[0:500]: ",
-        response_auxiliar_summary.content[0:500],
-    )
-
-    return response_auxiliar_summary.content
+# async def get_response_from_auxiliar_contextual_prompt(
+#     full_text_as_array: List[str], full_text_as_string: str
+# ):
+#     print("full_text_as_string: ", full_text_as_string)
+
+#     print("\nCRIANDO PROMPT AUXILIAR DO CONTEXTUAL")
+#     # PROMPT PARA GERAR O RESUMO INICIAL DO PROCESSO
+#     prompt_auxiliar_summary = create_prompt_auxiliar_do_contextual_prompt(
+#         full_text_as_string
+#     )
+
+#     import tiktoken
+#     import re
+
+#     # full_text_as_string = re.sub(r"\s+", " ", full_text_as_string).strip()
+
+#     encoding = tiktoken.get_encoding("cl100k_base")
+#     # Count tokens
+#     num_tokens = len(encoding.encode(full_text_as_string))
+#     with open("output.txt", "w", encoding="utf-8") as file:
+#         file.write(full_text_as_string)
+#     print(f"CONTAGEM DE TOKENS - {num_tokens}")
+
+#     print("\n\nprompt_auxiliar_summary[0:500]: ", prompt_auxiliar_summary[0:500])
+
+#     # Claude comentado pois o limite de tokens estava sendo passado pela requisição e dava erro
+#     # response_auxiliar_summary = await aclaude_answer(
+#     #     self.claude_client, self.claude_context_model, prompt_auxiliar_summary
+#     # )
+
+#     llms = LLM()
+#     print("\nCOMEÇANDO REQUISIÇÃO AUXILIAR DO CONTEXTUAL")
+#     response_auxiliar_summary = await llms.google_gemini().ainvoke(
+#         [HumanMessage(content=prompt_auxiliar_summary)]
+#     )
+#     print("TERMINOU REQUISIÇÃO AUXILIAR DO CONTEXTUAL")
+
+#     print(
+#         "\n\nresponse_auxiliar_summary.content[0:500]: ",
+#         response_auxiliar_summary.content[0:500],
+#     )
+
+#     return response_auxiliar_summary.content
+
+
+async def get_response_from_auxiliar_contextual_prompt(
+    full_text_as_array: List[str], full_text_as_string: str
+):
+    import tiktoken
+
+    encoding = tiktoken.get_encoding("cl100k_base")
+    llms = LLM()
+    responses = []
+
+    current_chunk = []
+    current_token_count = 0
+    chunk_counter = 1
+
+    for part in full_text_as_array:
+        part_tokens = len(encoding.encode(part))
+
+        # Check if adding this part would EXCEED the limit
+        if current_token_count + part_tokens > 600000:
+            # Process the accumulated chunk before it exceeds the limit
+            chunk_text = "".join(current_chunk)
+            print(
+                f"\nProcessing chunk {chunk_counter} with {current_token_count} tokens"
+            )
+
+            prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
+            response = await llms.google_gemini().ainvoke(
+                [HumanMessage(content=prompt)]
+            )
+            responses.append(response.content)
+
+            # Start new chunk with current part
+            current_chunk = [part]
+            current_token_count = part_tokens
+            chunk_counter += 1
+        else:
+            # Safe to add to current chunk
+            current_chunk.append(part)
+            current_token_count += part_tokens
+
+    # Process the final remaining chunk
+    if current_chunk:
+        chunk_text = "".join(current_chunk)
+        print(
+            f"\nProcessing final chunk {chunk_counter} with {current_token_count} tokens"
+        )
+        prompt = create_prompt_auxiliar_do_contextual_prompt(chunk_text)
+        response = await llms.google_gemini().ainvoke([HumanMessage(content=prompt)])
+        responses.append(response.content)
+
+    return "".join(responses)
 
 
 async def get_full_text_and_all_PDFs_chunks(
@@ -132,12 +198,14 @@ async def get_full_text_and_all_PDFs_chunks(
 
     # Load and process document
     for pdf_path in listaPDFs:
-        chunks, pages = await splitterObject.load_and_split_document(
-            pdf_path, should_use_llama_parse, isBubble
+        chunks, pages, full_text_as_string = (
+            await splitterObject.load_and_split_document(
+                pdf_path, should_use_llama_parse, isBubble
+            )
         )
         all_PDFs_chunks = all_PDFs_chunks + chunks
 
-    return all_PDFs_chunks, pages
+    return all_PDFs_chunks, pages, full_text_as_string
 
 
 async def generate_document_title(resumo_para_gerar_titulo: str):
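
For readers skimming the hunk above: the new get_response_from_auxiliar_contextual_prompt accumulates text parts until the next part would push the running token count (tiktoken, cl100k_base) past a 600 000-token budget, summarizes the accumulated batch with one Gemini request, then starts a new batch, and finally concatenates the per-batch responses. Below is a minimal, self-contained sketch of that batching pattern; the summarize callback is a hypothetical stand-in for the Gemini call and is illustrative only, not the repository's code.

from typing import Callable, List

import tiktoken  # same tokenizer the hunk above uses for counting

TOKEN_LIMIT = 600_000  # per-request budget taken from the hunk above


def summarize_in_batches(
    parts: List[str],
    summarize: Callable[[str], str],  # stand-in for the LLM call; illustrative only
    limit: int = TOKEN_LIMIT,
) -> str:
    """Accumulate parts until the next one would exceed the token budget,
    summarize the accumulated batch, then start a new batch with that part."""
    encoding = tiktoken.get_encoding("cl100k_base")
    responses: List[str] = []
    current_batch: List[str] = []
    current_tokens = 0

    for part in parts:
        part_tokens = len(encoding.encode(part))
        if current_tokens + part_tokens > limit and current_batch:
            # Flush the accumulated batch before it would exceed the limit.
            # (The `and current_batch` guard skips an empty request when a single
            # part is larger than the limit; the hunk above has no such guard.)
            responses.append(summarize("".join(current_batch)))
            current_batch, current_tokens = [part], part_tokens
        else:
            current_batch.append(part)
            current_tokens += part_tokens

    if current_batch:  # flush whatever is left
        responses.append(summarize("".join(current_batch)))

    return "".join(responses)


if __name__ == "__main__":
    # Toy usage with a fake summarizer that just reports the batch size.
    print(summarize_in_batches(["alpha " * 5, "beta " * 5], lambda text: f"[{len(text)} chars]"))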
_utils/main.py DELETED
@@ -1,73 +0,0 @@
-import os
-from _utils.utils import create_prompt_llm_chain, create_retriever, getPDF, create_llm, create_prompt_llm_chain_summary, process_embedding_summary
-from _utils import utils
-from langchain.chains import create_retrieval_chain
-from langchain_huggingface import HuggingFaceEmbeddings
-from langchain_chroma import Chroma
-from langchain_openai import OpenAIEmbeddings
-from langchain.chains.summarize import load_summarize_chain
-
-os.environ.get("OPENAI_API_KEY")
-
-def get_llm_answer(system_prompt, user_prompt, pdf_url, model, embedding):
-    if embedding == "gpt":
-        embedding_object = OpenAIEmbeddings()
-    else:
-        embedding_object = HuggingFaceEmbeddings(model_name=embedding)
-
-    vectorstore = Chroma(
-        collection_name="documents",
-        embedding_function=embedding_object
-    )
-
-    print('model: ', model)
-    print('embedding: ', embedding)
-    pages = []
-    if pdf_url:
-        pages = getPDF(pdf_url)
-    else:
-        pages = getPDF()
-    retriever = create_retriever(pages, vectorstore)
-    rag_chain = create_retrieval_chain(retriever, create_prompt_llm_chain(system_prompt, model))
-    results = rag_chain.invoke({"input": user_prompt})
-    # print('allIds ARQUIVO MAIN: ', utils.allIds)
-    vectorstore.delete(utils.allIds)
-    vectorstore.delete_collection()
-    utils.allIds = []
-    # print('utils.allIds: ', utils.allIds)
-    return results
-
-def get_llm_answer_summary(system_prompt, user_prompt, pdf_url, model, isIterativeRefinement):
-    print('model: ', model)
-    print('isIterativeRefinement: ', isIterativeRefinement)
-    print('\n\n\n')
-    pages = getPDF(pdf_url)
-    if not isIterativeRefinement:
-        rag_chain = create_prompt_llm_chain_summary(system_prompt, model)
-
-        results = rag_chain.invoke({"input": user_prompt, "context": pages})
-
-        return results
-    else:
-        chain = load_summarize_chain(create_llm(model), "refine", True)
-        result = chain.invoke({"input_documents": pages})
-        print('result: ', result)
-        return result
-    # Obs --> Para passar informações personalizadas --> chain = load_summarize_chain(llm, "refine", True, question_prompt=initial_prompt, refine_prompt=PromptTemplate.from_template(refine_prompt))
-    # Para ver mais opções --> Acessa a origem da função load_summarize_chain, e nela acessa a origem da função _load_refine_chain --> As opções são os parâmetros que esta última função recebe
-
-def get_llm_answer_summary_with_embedding(system_prompt, user_prompt, pdf_url, model, isIterativeRefinement):
-    print('model: ', model)
-    print('isIterativeRefinement: ', isIterativeRefinement)
-    print('\n\n\n')
-    pages = getPDF(pdf_url)
-    full_texto = ""
-    for p in pages:
-        full_texto += p.page_content
-    print('full_texto: ', full_texto)
-
-    rag_chain = process_embedding_summary(system_prompt, model)
-
-    results = rag_chain.invoke({"input": user_prompt, "context": pages})
-
-    return results
_utils/resumo_completo_cursor.py CHANGED
@@ -86,18 +86,22 @@ async def get_llm_summary_answer_by_cursor_complete(
         reciprocal_rank_fusion=reciprocal_rank_fusion,
     )
 
-    all_PDFs_chunks, full_text_as_array = await get_full_text_and_all_PDFs_chunks(
-        listaPDFs,
-        summarizer.splitter,
-        serializer["should_use_llama_parse"],
-        isBubble,
+    all_PDFs_chunks, full_text_as_array, full_text_as_string = (
+        await get_full_text_and_all_PDFs_chunks(
+            listaPDFs,
+            summarizer.splitter,
+            serializer["should_use_llama_parse"],
+            isBubble,
+        )
     )
 
     is_contextualized_chunk = serializer["should_have_contextual_chunks"]
 
     if is_contextualized_chunk:
         response_auxiliar_summary = (
-            await get_response_from_auxiliar_contextual_prompt(full_text_as_array)
+            await get_response_from_auxiliar_contextual_prompt(
+                full_text_as_array, full_text_as_string
+            )
         )
 
         print("\nCOMEÇANDO A FAZER AS REQUISIÇÕES DO CONTEXTUAL")
_utils/splitters/Splitter_class.py CHANGED
@@ -48,6 +48,9 @@ class Splitter:
         page_boundaries, combined_text = (
             combine_documents_without_losing_pagination(pages)
         )
+        full_text_as_string = ""
+        for page in pages:
+            full_text_as_string = full_text_as_string + page.page_content
         initial_chunks = initial_chunks + self.text_splitter.split_text(
             combined_text
         )
@@ -126,7 +129,7 @@ class Splitter:
            # char_count += len(text)
        print("TERMINOU DE ORGANIZAR PDFS EM CHUNKS")
 
-        return chunks, initial_chunks
+        return chunks, initial_chunks, full_text_as_string
 
    def load_and_split_text(self, text: str) -> List[DocumentChunk]:
        """Load Text and split into chunks with metadata - Criei essa função apenas para o ragas"""
gerar_documento/views.py CHANGED
@@ -125,7 +125,7 @@ class GerarEmentaView(AsyncAPIView):
         listaPDFs = [l["link_arquivo"] for l in data["files"]]
         print("\n\nlistaPDFs: ", listaPDFs)
 
-        all_PDFs_chunks, full_text_as_array = (
+        all_PDFs_chunks, full_text_as_array, full_text_as_string = (
             await get_full_text_and_all_PDFs_chunks(
                 listaPDFs,
                 Splitter(data["chunk_size"], data["chunk_overlap"]),
@@ -177,7 +177,7 @@ class GerarEmentaComPDFProprioView(AsyncAPIView):
         listaPDFs = [l["link_arquivo"] for l in data["files"]]
         print("\n\nlistaPDFs: ", listaPDFs)
 
-        all_PDFs_chunks, full_text_as_array = (
+        all_PDFs_chunks, full_text_as_array, full_text_as_string = (
             await get_full_text_and_all_PDFs_chunks(
                 listaPDFs,
                 Splitter(data["chunk_size"], data["chunk_overlap"]),