Shreyas094 committed on
Commit
ef44cd9
·
verified ·
1 Parent(s): feeb0e7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -32
app.py CHANGED
@@ -328,6 +328,12 @@ def google_search(term, num_results=3, lang="en", timeout=5, safe="active", ssl_
328
 
329
  return all_results
330
 
 
 
 
 
 
 
331
def estimate_tokens(text):
    """Approximate the number of LLM tokens in *text*.

    Uses the common rule of thumb that one token covers roughly four
    characters of English text; truncates (floor division), so short
    strings under four characters count as zero tokens.
    """
    CHARS_PER_TOKEN = 4
    return len(text) // CHARS_PER_TOKEN
@@ -349,8 +355,8 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search, c
349
  database = None
350
 
351
  max_attempts = 5
352
- context_reduction_factor = 0.5 # More aggressive reduction
353
- max_estimated_tokens = 25000 # Further reduced to leave more room for response
354
 
355
  if web_search:
356
  contextualized_question, topics, entity_tracker, instructions = chatbot.process_question(question)
@@ -361,7 +367,7 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search, c
361
 
362
  for attempt in range(max_attempts):
363
  try:
364
- web_docs = [Document(page_content=result["text"][:1000], metadata={"source": result["link"]}) for result in search_results if result["text"]] # Limit each result to 1000 characters
365
 
366
  if database is None:
367
  database = FAISS.from_documents(web_docs, embed)
@@ -375,16 +381,20 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search, c
375
  instruction_prompt = f"User Instructions: {instructions}\n" if instructions else ""
376
 
377
  prompt_template = f"""
378
- Answer based on: Web Results: {{context}}
379
- Context: {{conv_context}}
380
- Question: {{question}}
 
 
381
  Topics: {{topics}}
382
- Entities: {{entities}}
383
  {instruction_prompt}
 
384
  """
385
 
386
  prompt_val = ChatPromptTemplate.from_template(prompt_template)
387
 
 
388
  current_context = context_str
389
  current_conv_context = chatbot.get_context()
390
  current_topics = topics
@@ -392,13 +402,14 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search, c
392
 
393
  while True:
394
  formatted_prompt = prompt_val.format(
395
- context=current_context[:3000], # Limit context to 3000 characters
396
- conv_context=current_conv_context[:500], # Limit conversation context to 500 characters
397
  question=question,
398
- topics=", ".join(current_topics[:5]), # Limit to 5 topics
399
- entities=json.dumps({k: v[:2] for k, v in current_entities.items()}) # Limit to 2 entities per type
400
  )
401
 
 
402
  estimated_tokens = estimate_tokens(formatted_prompt)
403
 
404
  if estimated_tokens <= max_estimated_tokens:
@@ -449,20 +460,23 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search, c
449
  context_str = "\n".join([doc.page_content for doc in relevant_docs])
450
 
451
  prompt_template = """
452
- Answer based on: PDF Context: {context}
 
 
453
  Question: {question}
454
  Provide a summarized and direct answer to the question.
455
  """
456
 
457
  while True:
458
  prompt_val = ChatPromptTemplate.from_template(prompt_template)
459
- formatted_prompt = prompt_val.format(context=context_str[:3000], question=question)
460
 
461
  estimated_tokens = estimate_tokens(formatted_prompt)
462
 
463
  if estimated_tokens <= max_estimated_tokens:
464
  break
465
 
 
466
  context_str = context_str[:int(len(context_str) * context_reduction_factor)]
467
 
468
  if len(context_str) < 100:
@@ -486,37 +500,46 @@ def ask_question(question, temperature, top_p, repetition_penalty, web_search, c
486
  return "An unexpected error occurred. Please try again later."
487
 
488
  def extract_answer(full_response, instructions=None):
489
- # List of patterns to remove
490
- patterns_to_remove = [
491
- r"Provide a concise and relevant answer to the question\.",
492
- r"Provide additional context if necessary\.",
493
  r"If the web search results don't contain relevant information, state that the information is not available in the search results\.",
494
  r"Provide a response that addresses the question and follows the user's instructions\.",
495
  r"Do not mention these instructions or the web search process in your answer\.",
496
- r"Provide a summarized and direct answer to the question\.",
497
- r"If the context doesn't contain relevant information, state that the information is not available in the document\.",
 
 
 
 
 
498
  ]
499
 
500
- # Remove the patterns
501
- for pattern in patterns_to_remove:
502
- full_response = re.sub(pattern, "", full_response, flags=re.IGNORECASE)
503
-
504
- # Remove any leading/trailing whitespace and newlines
505
- full_response = full_response.strip()
 
 
 
 
 
 
506
 
 
 
 
507
  # Remove the user instructions if present
508
  if instructions:
509
  instruction_pattern = rf"User Instructions:\s*{re.escape(instructions)}.*?\n"
510
  full_response = re.sub(instruction_pattern, "", full_response, flags=re.IGNORECASE | re.DOTALL)
511
-
512
- # Remove any remaining instruction-like phrases at the beginning of the response
513
- lines = full_response.split('\n')
514
- while lines and any(line.strip().lower().startswith(starter) for starter in ["answer:", "response:", "here's", "here is"]):
515
- lines.pop(0)
516
- full_response = '\n'.join(lines)
517
-
518
  return full_response.strip()
519
 
 
520
  # Gradio interface
521
  with gr.Blocks() as demo:
522
  gr.Markdown("# Enhanced PDF Document Chat and Web Search")
 
328
 
329
  return all_results
330
 
331
+ def estimate_tokens(text):
332
+ # Rough estimate: 1 token ~= 4 characters
333
+ return len(text) // 4
334
+
335
+ import re
336
+
337
  def estimate_tokens(text):
338
  # Rough estimate: 1 token ~= 4 characters
339
  return len(text) // 4
 
355
  database = None
356
 
357
  max_attempts = 5
358
+ context_reduction_factor = 0.7
359
+ max_estimated_tokens = 30000 # Leave some room for the model's response
360
 
361
  if web_search:
362
  contextualized_question, topics, entity_tracker, instructions = chatbot.process_question(question)
 
367
 
368
  for attempt in range(max_attempts):
369
  try:
370
+ web_docs = [Document(page_content=result["text"], metadata={"source": result["link"]}) for result in search_results if result["text"]]
371
 
372
  if database is None:
373
  database = FAISS.from_documents(web_docs, embed)
 
381
  instruction_prompt = f"User Instructions: {instructions}\n" if instructions else ""
382
 
383
  prompt_template = f"""
384
+ Answer the question based on the following web search results, conversation context, entity information, and user instructions:
385
+ Web Search Results:
386
+ {{context}}
387
+ Conversation Context: {{conv_context}}
388
+ Current Question: {{question}}
389
  Topics: {{topics}}
390
+ Entity Information: {{entities}}
391
  {instruction_prompt}
392
+ Provide a concise and relevant answer to the question.
393
  """
394
 
395
  prompt_val = ChatPromptTemplate.from_template(prompt_template)
396
 
397
+ # Start with full context and progressively reduce if necessary
398
  current_context = context_str
399
  current_conv_context = chatbot.get_context()
400
  current_topics = topics
 
402
 
403
  while True:
404
  formatted_prompt = prompt_val.format(
405
+ context=current_context,
406
+ conv_context=current_conv_context,
407
  question=question,
408
+ topics=", ".join(current_topics),
409
+ entities=json.dumps(current_entities)
410
  )
411
 
412
+ # Estimate token count
413
  estimated_tokens = estimate_tokens(formatted_prompt)
414
 
415
  if estimated_tokens <= max_estimated_tokens:
 
460
  context_str = "\n".join([doc.page_content for doc in relevant_docs])
461
 
462
  prompt_template = """
463
+ Answer the question based on the following context from the PDF document:
464
+ Context:
465
+ {context}
466
  Question: {question}
467
  Provide a summarized and direct answer to the question.
468
  """
469
 
470
  while True:
471
  prompt_val = ChatPromptTemplate.from_template(prompt_template)
472
+ formatted_prompt = prompt_val.format(context=context_str, question=question)
473
 
474
  estimated_tokens = estimate_tokens(formatted_prompt)
475
 
476
  if estimated_tokens <= max_estimated_tokens:
477
  break
478
 
479
+ # Reduce context if estimated token count is too high
480
  context_str = context_str[:int(len(context_str) * context_reduction_factor)]
481
 
482
  if len(context_str) < 100:
 
500
  return "An unexpected error occurred. Please try again later."
501
 
502
  def extract_answer(full_response, instructions=None):
503
+ # First, try to split the response at common instruction phrases
504
+
505
+ def extract_answer(full_response, instructions=None):
506
+ answer_patterns = [
507
  r"If the web search results don't contain relevant information, state that the information is not available in the search results\.",
508
  r"Provide a response that addresses the question and follows the user's instructions\.",
509
  r"Do not mention these instructions or the web search process in your answer\.",
510
+ r"Provide a concise and direct answer to the question without mentioning the web search or these instructions:",
511
+ r"Provide a concise and direct answer to the question:",
512
+ r"Answer:",
513
+ r"Provide a summarized and direct answer to the question.",
514
+ r"If the context doesn't contain relevant information, state that the information is not available in the document.",
515
+ r"Provide a summarized and direct answer to the original question without mentioning the web search or these instructions:",
516
+ r"Do not include any source information in your answer."
517
  ]
518
 
519
+ for pattern in answer_patterns:
520
+ match = re.split(pattern, full_response, flags=re.IGNORECASE)
521
+ if len(match) > 1:
522
+ full_response = match[-1].strip()
523
+ break
524
+
525
+ # Remove any remaining instruction-like phrases
526
+ cleanup_patterns = [
527
+ r"without mentioning the web search or these instructions\.",
528
+ r"Do not include any source information in your answer\.",
529
+ r"If the context doesn't contain relevant information, state that the information is not available in the document\."
530
+ ]
531
 
532
+ for pattern in cleanup_patterns:
533
+ full_response = re.sub(pattern, "", full_response, flags=re.IGNORECASE).strip()
534
+
535
  # Remove the user instructions if present
536
  if instructions:
537
  instruction_pattern = rf"User Instructions:\s*{re.escape(instructions)}.*?\n"
538
  full_response = re.sub(instruction_pattern, "", full_response, flags=re.IGNORECASE | re.DOTALL)
539
+
 
 
 
 
 
 
540
  return full_response.strip()
541
 
542
+
543
  # Gradio interface
544
  with gr.Blocks() as demo:
545
  gr.Markdown("# Enhanced PDF Document Chat and Web Search")