Jatin Mehra committed on
Commit
4a31622
·
1 Parent(s): 24b32e6

Enhance PDF chunking logic and add validation for chat requests to improve data integrity and user experience

Browse files
Files changed (1) hide show
  1. app.py +29 -19
app.py CHANGED
@@ -178,16 +178,7 @@ async def upload_pdf(
178
  print("Warning: TAVILY_API_KEY is not set. Web search will not function.")
179
 
180
  documents = process_pdf_file(file_path)
181
- # Ensure max_length for chunk_text is appropriate.
182
- # The value 1500 might be too large if estimate_tokens is text_len // 4, as it means ~6000 characters.
183
- # Let's use a smaller max_length for chunks for better granularity in RAG retrieval.
184
- # For `bge-large-en-v1.5` (max sequence length 512 tokens), chunks around 250-400 tokens are often good.
185
- # If estimate_tokens is len(text)//4, then max_length of 250 tokens is roughly 1000 characters.
186
- # Let's use max_length=256 (tokens) for chunker config, so about 1024 characters.
187
- # The chunk_text function uses max_length as character count / 4. So if we want 256 tokens, max_length = 256*4 = 1024
188
- # However, the current chunk_text logic is `estimate_tokens(current_chunk + paragraph) <= max_length // 4`.
189
- # This means `max_length` is already considered a token limit. So `max_length=256` (tokens) is the target.
190
- chunks_with_metadata = chunk_text(documents, max_length=256) # max_length in tokens
191
 
192
  embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')
193
  embeddings, _ = create_embeddings(chunks_with_metadata, embedding_model) # Chunks are already with metadata
@@ -222,11 +213,25 @@ async def upload_pdf(
222
  # Route to chat with the document
223
  @app.post("/chat")
224
  async def chat(request: ChatRequest):
 
 
 
 
 
 
 
225
  session, found = load_session(request.session_id, model_name=request.model_name)
226
  if not found:
227
  raise HTTPException(status_code=404, detail="Session not found or expired. Please upload a document first.")
228
 
229
  try:
 
 
 
 
 
 
 
230
  # Per-request memory to ensure chat history is correctly loaded for the agent
231
  agent_memory = ConversationBufferMemory(memory_key="chat_history", input_key="input", return_messages=True)
232
  for entry in session.get("chat_history", []):
@@ -237,15 +242,14 @@ async def chat(request: ChatRequest):
237
  current_request_tools = []
238
 
239
  # 1. Add the document-specific vector search tool
240
- if "index" in session and "chunks" in session and "model" in session:
241
- vector_search_tool_instance = create_vector_search_tool(
242
- faiss_index=session["index"],
243
- document_chunks_with_metadata=session["chunks"], # Pass the correct variable
244
- embedding_model=session["model"] # This is the SentenceTransformer model
245
- )
246
- current_request_tools.append(vector_search_tool_instance)
247
- else:
248
- print(f"Warning: Session {request.session_id} missing data for vector_database_search tool.")
249
 
250
  # 2. Conditionally add Tavily (web search) tool
251
  if request.use_search:
@@ -270,6 +274,10 @@ async def chat(request: ChatRequest):
270
  k=5 # Number of chunks for initial context
271
  )
272
 
 
 
 
 
273
  response = agentic_rag(
274
  session["llm"],
275
  current_request_tools, # Pass the dynamically assembled list of tools
@@ -280,6 +288,8 @@ async def chat(request: ChatRequest):
280
  )
281
 
282
  response_output = response.get("output", "Sorry, I could not generate a response.")
 
 
283
  session["chat_history"].append({"user": request.query, "assistant": response_output})
284
  save_session(request.session_id, session) # Save updated history and potentially other modified session state
285
 
 
178
  print("Warning: TAVILY_API_KEY is not set. Web search will not function.")
179
 
180
  documents = process_pdf_file(file_path)
181
+ chunks_with_metadata = chunk_text(documents, max_length=1000) # Increased from 256 to 1000 tokens for better context
 
 
 
 
 
 
 
 
 
182
 
183
  embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')
184
  embeddings, _ = create_embeddings(chunks_with_metadata, embedding_model) # Chunks are already with metadata
 
213
  # Route to chat with the document
214
  @app.post("/chat")
215
  async def chat(request: ChatRequest):
216
+ # Validate query
217
+ if not request.query or not request.query.strip():
218
+ raise HTTPException(status_code=400, detail="Query cannot be empty")
219
+
220
+ if len(request.query.strip()) < 3:
221
+ raise HTTPException(status_code=400, detail="Query must be at least 3 characters long")
222
+
223
  session, found = load_session(request.session_id, model_name=request.model_name)
224
  if not found:
225
  raise HTTPException(status_code=404, detail="Session not found or expired. Please upload a document first.")
226
 
227
  try:
228
+ # Validate session data integrity
229
+ required_keys = ["index", "chunks", "model", "llm"]
230
+ missing_keys = [key for key in required_keys if key not in session]
231
+ if missing_keys:
232
+ print(f"Warning: Session {request.session_id} missing required data: {missing_keys}")
233
+ raise HTTPException(status_code=500, detail="Session data is incomplete. Please upload the document again.")
234
+
235
  # Per-request memory to ensure chat history is correctly loaded for the agent
236
  agent_memory = ConversationBufferMemory(memory_key="chat_history", input_key="input", return_messages=True)
237
  for entry in session.get("chat_history", []):
 
242
  current_request_tools = []
243
 
244
  # 1. Add the document-specific vector search tool
245
+ vector_search_tool_instance = create_vector_search_tool(
246
+ faiss_index=session["index"],
247
+ document_chunks_with_metadata=session["chunks"], # Pass the correct variable
248
+ embedding_model=session["model"], # This is the SentenceTransformer model
249
+ max_chunk_length=1000,
250
+ k=10
251
+ )
252
+ current_request_tools.append(vector_search_tool_instance)
 
253
 
254
  # 2. Conditionally add Tavily (web search) tool
255
  if request.use_search:
 
274
  k=5 # Number of chunks for initial context
275
  )
276
 
277
+ print(f"Query: '{request.query}' - Found {len(initial_similar_chunks)} initial chunks")
278
+ if initial_similar_chunks:
279
+ print(f"Best chunk score: {initial_similar_chunks[0][1]:.4f}")
280
+
281
  response = agentic_rag(
282
  session["llm"],
283
  current_request_tools, # Pass the dynamically assembled list of tools
 
288
  )
289
 
290
  response_output = response.get("output", "Sorry, I could not generate a response.")
291
+ print(f"Generated response length: {len(response_output)} characters")
292
+
293
  session["chat_history"].append({"user": request.query, "assistant": response_output})
294
  save_session(request.session_id, session) # Save updated history and potentially other modified session state
295