Shreyas094 committed on
Commit
0e2e9a3
·
verified ·
1 Parent(s): 9f2051d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -17
app.py CHANGED
@@ -455,18 +455,27 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
455
  if os.path.exists("faiss_database"):
456
  logging.info("Loading FAISS database")
457
  database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
 
 
 
 
 
458
  else:
459
  logging.warning("No FAISS database found")
460
  yield "No documents available. Please upload PDF documents to answer questions."
461
  return
462
 
463
- retriever = database.as_retriever(search_kwargs={"k": 10}) # Increased k to 10
464
  logging.info(f"Retrieving relevant documents for query: {query}")
465
  relevant_docs = retriever.get_relevant_documents(query)
466
  logging.info(f"Number of relevant documents retrieved: {len(relevant_docs)}")
467
 
 
 
 
 
468
  # Filter relevant_docs based on selected documents
469
- filtered_docs = [doc for doc in relevant_docs if doc.metadata["source"] in selected_docs]
470
  logging.info(f"Number of filtered documents: {len(filtered_docs)}")
471
 
472
  if not filtered_docs:
@@ -474,28 +483,28 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
474
  yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
475
  return
476
 
477
- for doc in filtered_docs:
478
- logging.info(f"Document source: {doc.metadata['source']}")
479
- logging.info(f"Document content preview: {doc.page_content[:100]}...")
 
 
 
 
 
 
 
480
 
481
- # Implement a sliding window approach for context
482
- max_context_length = 4000 # Adjust based on your model's capacity
483
  context_chunks = []
484
- current_chunk = ""
485
  for doc in filtered_docs:
486
- if len(current_chunk) + len(doc.page_content) > max_context_length:
487
- context_chunks.append(current_chunk)
488
- current_chunk = doc.page_content
489
- else:
490
- current_chunk += "\n" + doc.page_content
491
- if current_chunk:
492
- context_chunks.append(current_chunk)
493
 
494
- logging.info(f"Number of context chunks: {len(context_chunks)}")
495
 
496
  for i, context_str in enumerate(context_chunks):
497
  logging.info(f"Processing context chunk {i+1}/{len(context_chunks)}")
498
  logging.info(f"Context chunk length: {len(context_str)}")
 
499
 
500
  if model == "@cf/meta/llama-3.1-8b-instruct":
501
  logging.info("Using Cloudflare API")
@@ -514,7 +523,7 @@ Write a detailed and complete response that answers the following user question:
514
  logging.info(f"API call {j+1}/{num_calls}")
515
  for message in client.chat_completion(
516
  messages=[{"role": "user", "content": prompt}],
517
- max_tokens=10000,
518
  temperature=temperature,
519
  stream=True,
520
  ):
 
455
  if os.path.exists("faiss_database"):
456
  logging.info("Loading FAISS database")
457
  database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
458
+
459
+ # Inspect FAISS database
460
+ logging.info(f"FAISS database size: {len(database.docstore._dict)}")
461
+ for doc_id, doc in database.docstore._dict.items():
462
+ logging.info(f"Document ID: {doc_id}, Source: {doc.metadata.get('source', 'Unknown')}")
463
  else:
464
  logging.warning("No FAISS database found")
465
  yield "No documents available. Please upload PDF documents to answer questions."
466
  return
467
 
468
+ retriever = database.as_retriever(search_kwargs={"k": 20}) # Increased k to 20
469
  logging.info(f"Retrieving relevant documents for query: {query}")
470
  relevant_docs = retriever.get_relevant_documents(query)
471
  logging.info(f"Number of relevant documents retrieved: {len(relevant_docs)}")
472
 
473
+ # Log details of retrieved documents
474
+ for i, doc in enumerate(relevant_docs):
475
+ logging.info(f"Retrieved document {i+1}: Source: {doc.metadata.get('source', 'Unknown')}, Content preview: {doc.page_content[:100]}...")
476
+
477
  # Filter relevant_docs based on selected documents
478
+ filtered_docs = [doc for doc in relevant_docs if doc.metadata.get("source") in selected_docs]
479
  logging.info(f"Number of filtered documents: {len(filtered_docs)}")
480
 
481
  if not filtered_docs:
 
483
  yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
484
  return
485
 
486
# Implement a custom chunking strategy
def custom_chunk(text, chunk_size=1000, overlap=200):
    """Split *text* into overlapping chunks.

    Each chunk is at most ``chunk_size`` characters long, and consecutive
    chunks share their last/first ``overlap`` characters so that context
    spanning a chunk boundary is not lost.

    Args:
        text: The string to split. An empty string yields an empty list.
        chunk_size: Maximum length of each chunk (must be > overlap).
        overlap: Number of characters shared between adjacent chunks.

    Returns:
        A list of string chunks covering all of ``text``.

    Raises:
        ValueError: If ``overlap >= chunk_size`` (the window would never
            advance, causing an infinite loop in the original version).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Stop once the end of the text is consumed; the original code
        # could emit a final chunk fully contained in the previous one.
        if end >= len(text):
            break
        start = end - overlap
    return chunks
496
 
 
 
497
  context_chunks = []
 
498
  for doc in filtered_docs:
499
+ doc_chunks = custom_chunk(doc.page_content)
500
+ context_chunks.extend(doc_chunks)
 
 
 
 
 
501
 
502
+ logging.info(f"Number of context chunks after custom chunking: {len(context_chunks)}")
503
 
504
  for i, context_str in enumerate(context_chunks):
505
  logging.info(f"Processing context chunk {i+1}/{len(context_chunks)}")
506
  logging.info(f"Context chunk length: {len(context_str)}")
507
+ logging.info(f"Context chunk preview: {context_str[:100]}...")
508
 
509
  if model == "@cf/meta/llama-3.1-8b-instruct":
510
  logging.info("Using Cloudflare API")
 
523
  logging.info(f"API call {j+1}/{num_calls}")
524
  for message in client.chat_completion(
525
  messages=[{"role": "user", "content": prompt}],
526
+ max_tokens=2000, # Reduced max_tokens to avoid potential errors
527
  temperature=temperature,
528
  stream=True,
529
  ):