Shreyas094 committed on
Commit 41afb33 · verified · 1 Parent(s): 9fb0197

Update app.py

Files changed (1)
  1. app.py +100 -310
app.py CHANGED
@@ -39,8 +39,6 @@ from typing import List, Dict, Tuple
 import datetime
 from abc import ABC, abstractmethod
 from typing import List, Dict, Any
-import spacy
-from textblob import TextBlob
 
 # Automatically get the current year
 CURRENT_YEAR = datetime.datetime.now().year
@@ -86,7 +84,7 @@ custom_models = fetch_custom_models()
 all_models = ["huggingface", "groq", "mistral"] + custom_models
 
 # Determine the default model
-default_model = CUSTOM_LLM_DEFAULT_MODEL if CUSTOM_LLM_DEFAULT_MODEL in all_models else "mistral"
+default_model = CUSTOM_LLM_DEFAULT_MODEL if CUSTOM_LLM_DEFAULT_MODEL in all_models else "groq"
 
 logger.info(f"Default model selected: {default_model}")
 
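The fallback default changes here from "mistral" to "groq"; the configured custom model still wins when it is registered. A minimal sketch of the same pattern, assuming (hypothetically) that CUSTOM_LLM_DEFAULT_MODEL is read from an environment variable:

```python
import os

# Hypothetical stand-ins: fetch_custom_models() and the env handling live outside this hunk.
custom_models = ["my-finetune"]
all_models = ["huggingface", "groq", "mistral"] + custom_models

# Prefer the configured custom default, but only if it is actually registered;
# otherwise fall back to a known-good built-in ("groq" after this commit).
CUSTOM_LLM_DEFAULT_MODEL = os.environ.get("CUSTOM_LLM_DEFAULT_MODEL", "")
default_model = CUSTOM_LLM_DEFAULT_MODEL if CUSTOM_LLM_DEFAULT_MODEL in all_models else "groq"
```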
@@ -536,212 +534,75 @@ def prepare_documents_for_bm25(documents: List[Dict]) -> Tuple[List[str], List[Dict]]:
         doc_texts.append(doc_text)
     return doc_texts, documents
 
-
-class ImprovedRanking:
-    def __init__(self):
-        # Load spacy for text analysis
-        self.nlp = spacy.load('en_core_web_sm')
-
-    def analyze_query(self, query: str) -> Dict:
-        """
-        Analyze query to determine appropriate weights
-
-        Args:
-            query: Search query string
-
-        Returns:
-            Dictionary with query analysis results
-        """
-        doc = self.nlp(query)
-
-        analysis = {
-            'word_count': len(query.split()),
-            'has_entities': bool(doc.ents),
-            'is_question': any(token.tag_ == 'WP' or token.tag_ == 'WRB' for token in doc),
-            'sentiment': TextBlob(query).sentiment.polarity
-        }
-
-        return analysis
-
-    def get_adaptive_weights(self, query: str) -> Tuple[float, float]:
-        """
-        Calculate adaptive weights based on query characteristics
-
-        Args:
-            query: Search query string
-
-        Returns:
-            Tuple of (bm25_weight, semantic_weight)
-        """
-        analysis = self.analyze_query(query)
-
-        # Base weights
-        bm25_weight = 0.4
-        semantic_weight = 0.6
-
-        # Adjust weights based on query characteristics
-        if analysis['word_count'] <= 2:
-            # Short queries: favor keyword matching
-            bm25_weight = 0.6
-            semantic_weight = 0.4
-        elif analysis['word_count'] >= 6:
-            # Long queries: favor semantic understanding
-            bm25_weight = 0.3
-            semantic_weight = 0.7
-
-        if analysis['has_entities']:
-            # Queries with named entities: increase keyword importance
-            bm25_weight += 0.1
-            semantic_weight -= 0.1
-
-        if analysis['is_question']:
-            # Questions: favor semantic understanding
-            bm25_weight -= 0.1
-            semantic_weight += 0.1
-
-        # Normalize weights to ensure they sum to 1
-        total = bm25_weight + semantic_weight
-        return bm25_weight / total, semantic_weight / total
-
-    def calculate_relevance_score(self, doc: Dict, query: str, similarity_model) -> float:
-        """
-        Calculate comprehensive relevance score for a document
-
-        Args:
-            doc: Document dictionary with title and content
-            query: Search query string
-            similarity_model: Model for computing semantic similarity
-
-        Returns:
-            Float representing document relevance score
-        """
-        # 1. Title relevance (30%)
-        title_embedding = similarity_model.encode(doc['title'], convert_to_tensor=True)
-        query_embedding = similarity_model.encode(query, convert_to_tensor=True)
-        title_similarity = torch.cosine_similarity(title_embedding, query_embedding, dim=0).item()
-
-        # 2. Content relevance (40%)
-        # Use first 512 tokens of content to avoid memory issues
-        content_preview = ' '.join(doc['content'].split()[:512])
-        content_embedding = similarity_model.encode(content_preview, convert_to_tensor=True)
-        content_similarity = torch.cosine_similarity(content_embedding, query_embedding, dim=0).item()
-
-        # 3. Query term presence (20%)
-        query_terms = set(query.lower().split())
-        title_terms = set(doc['title'].lower().split())
-        content_terms = set(content_preview.lower().split())
-
-        title_term_overlap = len(query_terms & title_terms) / len(query_terms)
-        content_term_overlap = len(query_terms & content_terms) / len(query_terms)
-
-        # 4. Document quality indicators (10%)
-        quality_score = self.assess_document_quality(doc)
-
-        # Combine scores with weights
-        final_score = (
-            title_similarity * 0.3 +
-            content_similarity * 0.4 +
-            ((title_term_overlap + content_term_overlap) / 2) * 0.2 +
-            quality_score * 0.1
-        )
-
-        return final_score
-
-    def assess_document_quality(self, doc: Dict) -> float:
-        """
-        Assess document quality based on various metrics
-
-        Args:
-            doc: Document dictionary
-
-        Returns:
-            Float representing document quality score
-        """
-        score = 0.0
-
-        # 1. Length score (longer documents often have more information)
-        content_length = len(doc['content'].split())
-        length_score = min(content_length / 1000, 1.0)  # Cap at 1000 words
-
-        # 2. Text structure score
-        has_paragraphs = doc['content'].count('\n\n') > 0
-        has_sections = bool(re.findall(r'\n[A-Z][^.!?]*[:]\n', doc['content']))
-
-        # 3. Writing quality score (using basic metrics)
-        blob = TextBlob(doc['content'])
-        sentences = blob.sentences
-        avg_sentence_length = sum(len(str(s).split()) for s in sentences) / len(sentences) if sentences else 0
-        sentence_score = 1.0 if 10 <= avg_sentence_length <= 25 else 0.5
-
-        # Combine quality metrics
-        score = (
-            length_score * 0.4 +
-            (has_paragraphs * 0.2 + has_sections * 0.2) +
-            sentence_score * 0.2
-        )
-
-        return score
-
 # Now modify the rerank_documents_with_priority function to include BM25 ranking
-def rerank_documents_improved(query: str, documents: List[Dict],
-                              similarity_model, max_results: int = 5) -> List[Dict]:
-    """
-    Rerank documents using improved scoring system
-
-    Args:
-        query: Search query string
-        documents: List of document dictionaries
-        similarity_model: Model for computing semantic similarity
-        max_results: Maximum number of results to return
-
-    Returns:
-        List of reranked documents
-    """
-    ranker = ImprovedRanking()
-
+def rerank_documents(query: str, documents: List[Dict],
+                     similarity_threshold: float = 0.95, max_results: int = 5) -> List[Dict]:
     try:
         if not documents:
+            logger.warning("No documents to rerank.")
             return documents
 
-        # Get adaptive weights based on query
-        bm25_weight, semantic_weight = ranker.get_adaptive_weights(query)
-
-        # Prepare documents for BM25
+        # Step 1: Prepare documents for BM25
         doc_texts, original_docs = prepare_documents_for_bm25(documents)
 
-        # Initialize and fit BM25
+        # Step 2: Initialize and fit BM25
         bm25 = BM25()
         bm25.fit(doc_texts)
 
-        # Get BM25 scores
+        # Step 3: Get BM25 scores
         bm25_scores = bm25.get_scores(query)
 
-        # Calculate comprehensive relevance scores
-        relevance_scores = [
-            ranker.calculate_relevance_score(doc, query, similarity_model)
-            for doc in documents
-        ]
+        # Step 4: Get semantic similarity scores
+        query_embedding = similarity_model.encode(query, convert_to_tensor=True)
+        doc_summaries = [doc['summary'] for doc in documents]
+        doc_embeddings = similarity_model.encode(doc_summaries, convert_to_tensor=True)
+        semantic_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
 
-        # Normalize scores
+        # Step 5: Combine scores (normalize first)
         bm25_scores_norm = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores))
-        relevance_scores_norm = (np.array(relevance_scores) - np.min(relevance_scores)) / (np.max(relevance_scores) - np.min(relevance_scores))
+        semantic_scores_norm = (semantic_scores - torch.min(semantic_scores)) / (torch.max(semantic_scores) - torch.min(semantic_scores))
 
-        # Combine scores using adaptive weights
-        final_scores = (bm25_weight * bm25_scores_norm +
-                        semantic_weight * relevance_scores_norm)
+        # Combine scores with weights (0.4 for BM25, 0.6 for semantic similarity)
+        combined_scores = 0.4 * bm25_scores_norm + 0.6 * semantic_scores_norm.numpy()
 
-        # Create scored documents
-        scored_documents = list(zip(documents, final_scores))
+        # Create scored documents with combined scores
+        scored_documents = list(zip(documents, combined_scores))
 
-        # Sort by final score
+        # Sort by combined score (descending)
         scored_documents.sort(key=lambda x: x[1], reverse=True)
 
-        # Return top results
-        return [doc for doc, score in scored_documents[:max_results]]
+        # Filter similar documents
+        filtered_docs = []
+        added_contents = []
+
+        for doc, score in scored_documents:
+            if score < 0.3:  # Minimum relevance threshold
+                continue
+
+            # Check similarity with already selected documents
+            doc_embedding = similarity_model.encode(doc['summary'], convert_to_tensor=True)
+            is_similar = False
+
+            for content in added_contents:
+                content_embedding = similarity_model.encode(content, convert_to_tensor=True)
+                similarity = util.pytorch_cos_sim(doc_embedding, content_embedding)
+                if similarity > similarity_threshold:
+                    is_similar = True
+                    break
+
+            if not is_similar:
+                filtered_docs.append(doc)
+                added_contents.append(doc['summary'])
+
+            if len(filtered_docs) >= max_results:
+                break
+
+        logger.info(f"Reranked and filtered to {len(filtered_docs)} unique documents using BM25 and semantic similarity.")
+        return filtered_docs
 
     except Exception as e:
-        logger.error(f"Error during improved reranking: {e}")
-        return documents[:max_results]
+        logger.error(f"Error during reranking documents: {e}")
+        return documents[:max_results]  # Fallback to first max_results documents if reranking fails
 
 def compute_similarity(text1, text2):
     # Encode the texts
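Both the deleted rerank_documents_improved and the new rerank_documents min-max-normalize the BM25 and semantic scores before blending them. The inline expression divides by max - min, which is zero whenever every document ties (e.g., a single result), yielding NaN; the normalize_scores helper deleted further down in this commit guarded exactly that case. A minimal sketch of the 0.4/0.6 blend with the guard kept, using hypothetical toy scores:

```python
import numpy as np

def normalize_scores(scores: np.ndarray) -> np.ndarray:
    """Min-max normalize to [0, 1]; constant arrays map to 1.0 instead of dividing by zero."""
    if np.all(scores == scores[0]):
        return np.ones_like(scores)
    return (scores - np.min(scores)) / (np.max(scores) - np.min(scores))

# Toy scores for three documents (hypothetical values, for illustration only).
bm25_scores = np.array([2.1, 0.0, 1.3])
semantic_scores = np.array([0.62, 0.55, 0.80])

# Fixed 0.4 BM25 / 0.6 semantic weighting, as in the new rerank_documents.
combined = 0.4 * normalize_scores(bm25_scores) + 0.6 * normalize_scores(semantic_scores)
ranking = combined.argsort()[::-1]  # document indices, best first
print(ranking)  # -> [2 0 1]
```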
@@ -917,9 +778,6 @@ def search_and_scrape(
     use_pydf2: bool = True
 ):
     try:
-        # Initialize ImprovedRanking instead of DocumentRanker
-        document_ranker = ImprovedRanking()
-
         # Step 1: Rephrase the Query
         rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
         logger.info(f"Rephrased Query: {rephrased_query}")
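rephrase_query() itself is outside this diff; the surrounding code only assumes it returns either a standalone, search-ready query or some "no search needed" signal that the (elided) condition before the next visible lines detects. A hypothetical sketch of that contract, with the LLM call stubbed out:

```python
from typing import Dict, List

def rephrase_query(chat_history: List[Dict], query: str, temperature: float = 0.2) -> str:
    """Stub for illustration only: the real helper (elsewhere in app.py) calls an LLM to
    turn the conversation plus the new message into a standalone search query."""
    return query.strip() or "No search needed"

rephrased = rephrase_query([{"role": "user", "content": "hi"}], "latest SearXNG release?")
if rephrased == "No search needed":  # hypothetical sentinel; the real check is outside this hunk
    print("No search needed for the provided input.")
else:
    print(f"Rephrased Query: {rephrased}")
```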
@@ -928,7 +786,8 @@ def search_and_scrape(
             logger.info("No need to perform search based on the rephrased query.")
             return "No search needed for the provided input."
 
-        # [Search parameters and request handling remain the same...]
+        # Step 2: Perform search
+        # Search query parameters
         params = {
             'q': rephrased_query,
             'format': 'json',
@@ -941,11 +800,13 @@ def search_and_scrape(
 
         # Remove empty parameters
         params = {k: v for k, v in params.items() if v != ""}
-
+        
+        # If no engines are specified, set default engines
         if 'engines' not in params:
-            params['engines'] = 'google'
+            params['engines'] = 'google'  # Default to 'google' or any preferred engine
             logger.info("No engines specified. Defaulting to 'google'.")
 
+        # Headers for SearXNG request
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
             'Accept': 'application/json, text/javascript, */*; q=0.01',
@@ -961,16 +822,18 @@ def search_and_scrape(
 
         scraped_content = []
         page = 1
-
-        # Content scraping loop remains mostly the same, but add quality assessment
         while len(scraped_content) < num_results:
+            # Update params with current page
             params['pageno'] = page
-
+            
+            # Send request to SearXNG
+            logger.info(f"Sending request to SearXNG for query: {rephrased_query} (Page {page})")
+            session = requests_retry_session()
+            
             try:
-                session = requests_retry_session()
                 if method.upper() == "GET":
                     response = session.get(SEARXNG_URL, params=params, headers=headers, timeout=10, verify=certifi.where())
-                else:
+                else:  # POST
                     response = session.post(SEARXNG_URL, data=params, headers=headers, timeout=10, verify=certifi.where())
 
                 response.raise_for_status()
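requests_retry_session() is called here but defined outside the visible hunks. A typical implementation of such a helper, sketched as an assumption, mounts an HTTPAdapter with urllib3 retries so transient SearXNG errors are retried with backoff:

```python
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def requests_retry_session(retries: int = 3, backoff_factor: float = 0.3) -> requests.Session:
    # Assumed implementation; the real helper lives elsewhere in app.py.
    session = requests.Session()
    retry = Retry(
        total=retries,
        backoff_factor=backoff_factor,
        status_forcelist=(500, 502, 503, 504),  # retry transient server errors
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session
```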
@@ -979,8 +842,9 @@ def search_and_scrape(
                 return f"An error occurred during the search request: {e}"
 
             search_results = response.json()
+            logger.debug(f"SearXNG Response: {search_results}")
+            
             results = search_results.get('results', [])
-
             if not results:
                 logger.warning(f"No more results returned from SearXNG on page {page}.")
                 break
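The parsing above only relies on a top-level results list whose entries carry url and title, with .get() fallbacks for missing keys. An illustrative (not verbatim) SearXNG JSON payload of the shape this code assumes:

```python
# Illustrative payload shape, not an exact SearXNG response.
search_results = {
    "query": "example query",
    "results": [
        {"url": "https://example.com/article", "title": "Example article", "content": "snippet..."},
    ],
}
results = search_results.get("results", [])  # an empty list ends the pagination loop
for result in results:
    print(result.get("url", ""), result.get("title", "No title"))
```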
@@ -988,40 +852,33 @@ def search_and_scrape(
             for result in results:
                 if len(scraped_content) >= num_results:
                     break
-
+                
                 url = result.get('url', '')
                 title = result.get('title', 'No title')
-
+                
                 if not is_valid_url(url):
                     logger.warning(f"Invalid URL: {url}")
                     continue
-
+                
                 try:
                     logger.info(f"Processing content from: {url}")
+                    
                     content = scrape_full_content(url, max_chars, timeout, use_pydf2)
 
-                    if content is None:
+                    if content is None:  # This means it's a PDF and use_pydf2 is False
                         continue
 
                     if not content:
                         logger.warning(f"Failed to scrape content from {url}")
                         continue
 
-                    # Add initial quality assessment
-                    doc_quality = document_ranker.assess_document_quality({
-                        "title": title,
-                        "content": content
-                    })
-
                     scraped_content.append({
                         "title": title,
                         "url": url,
                         "content": content,
-                        "scraper": "pdf" if url.lower().endswith('.pdf') else "newspaper",
-                        "quality_score": doc_quality
+                        "scraper": "pdf" if url.lower().endswith('.pdf') else "newspaper"
                     })
-                    logger.info(f"Successfully scraped content from {url}. Quality score: {doc_quality}")
-
+                    logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
                 except requests.exceptions.RequestException as e:
                     logger.error(f"Error scraping {url}: {e}")
                 except Exception as e:
1033
  logger.warning("No content scraped from search results.")
1034
  return "No content could be scraped from the search results."
1035
 
1036
- # Modified relevance assessment with improved analysis
 
 
1037
  relevant_documents = []
1038
- unique_summaries = set()
1039
-
1040
  for doc in scraped_content:
1041
  assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
1042
  relevance, summary = assessment.split('\n', 1)
1043
-
1044
  if relevance.strip().lower() == "relevant: yes":
1045
  summary_text = summary.replace("Summary: ", "").strip()
1046
 
1047
- if is_content_unique(summary_text, unique_summaries, similarity_threshold=0.8):
1048
- # Calculate comprehensive relevance score using new method
1049
- relevance_score = document_ranker.calculate_relevance_score(
1050
- {
1051
- "title": doc['title'],
1052
- "content": doc['content'],
1053
- "summary": summary_text
1054
- },
1055
- rephrased_query,
1056
- similarity_model
1057
- )
1058
-
1059
  relevant_documents.append({
1060
  "title": doc['title'],
1061
  "url": doc['url'],
1062
- "content": doc['content'],
1063
  "summary": summary_text,
1064
- "scraper": doc['scraper'],
1065
- "relevance_score": relevance_score,
1066
- "quality_score": doc['quality_score']
1067
  })
1068
- unique_summaries.add(summary_text)
 
 
1069
 
1070
  if not relevant_documents:
1071
  logger.warning("No relevant and unique documents found.")
1072
- return "No relevant and unique content found for the given query."
1073
-
1074
- # Enhanced reranking using improved weights and BM25
1075
- try:
1076
- # Get query-adaptive weights
1077
- bm25_weight, semantic_weight = document_ranker.get_adaptive_weights(rephrased_query)
1078
- logger.info(f"Using adaptive weights - BM25: {bm25_weight}, Semantic: {semantic_weight}")
1079
-
1080
- # Prepare documents for BM25
1081
- doc_texts = [f"{doc['title']} {doc['content']}" for doc in relevant_documents]
1082
-
1083
- # Initialize and fit BM25
1084
- bm25 = BM25()
1085
- bm25.fit(doc_texts)
1086
-
1087
- # Get BM25 scores
1088
- bm25_scores = bm25.get_scores(rephrased_query)
1089
-
1090
- # Calculate semantic scores using title and content
1091
- query_embedding = similarity_model.encode(rephrased_query, convert_to_tensor=True)
1092
- doc_embeddings = similarity_model.encode(
1093
- [f"{doc['title']} {doc['summary']}" for doc in relevant_documents],
1094
- convert_to_tensor=True
1095
- )
1096
- semantic_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
1097
-
1098
- # Get quality scores
1099
- quality_scores = np.array([doc['quality_score'] for doc in relevant_documents])
1100
-
1101
- # Normalize all scores
1102
- bm25_scores_norm = normalize_scores(bm25_scores)
1103
- semantic_scores_norm = normalize_scores(semantic_scores.numpy())
1104
- quality_scores_norm = normalize_scores(quality_scores)
1105
- relevance_scores = normalize_scores(
1106
- np.array([doc['relevance_score'] for doc in relevant_documents])
1107
- )
1108
-
1109
- # Combine scores with weights
1110
- final_scores = (
1111
- bm25_weight * bm25_scores_norm +
1112
- semantic_weight * semantic_scores_norm +
1113
- 0.15 * quality_scores_norm + # Add quality score weight
1114
- 0.15 * relevance_scores # Reduced from 0.2 to accommodate quality
1115
- )
1116
-
1117
- # Create scored documents
1118
- scored_documents = list(zip(relevant_documents, final_scores))
1119
- scored_documents.sort(key=lambda x: x[1], reverse=True)
1120
-
1121
- # Take top results
1122
- reranked_docs = [doc for doc, _ in scored_documents[:num_results]]
1123
-
1124
- except Exception as e:
1125
- logger.error(f"Error during document reranking: {e}")
1126
- # Fallback to basic sorting by relevance and quality
1127
- reranked_docs = sorted(
1128
- relevant_documents,
1129
- key=lambda x: (x['relevance_score'] + x['quality_score']) / 2,
1130
- reverse=True
1131
- )[:num_results]
1132
 
 
 
 
1133
  if not reranked_docs:
1134
  logger.warning("No documents remained after reranking.")
1135
- return "No relevant content found after filtering and ranking."
 
 
1136
 
1137
- # Prepare final documents for LLM
 
 
 
 
 
1138
  llm_input = {
1139
  "query": query,
1140
  "documents": [
@@ -1142,13 +939,12 @@ def search_and_scrape(
                     "title": doc['title'],
                     "url": doc['url'],
                     "summary": doc['summary'],
-                    "content": doc['content'],
-                    "quality_score": doc['quality_score']  # Include quality score
-                } for doc in reranked_docs
+                    "full_content": doc['full_content']
+                } for doc in reranked_docs[:num_results]
             ]
         }
 
-        # LLM Summarization
+        # Step 6: LLM Summarization
         llm_summary = llm_summarize(json.dumps(llm_input), model, temperature=llm_temperature)
 
         return llm_summary
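is_content_unique() is another helper outside this diff; the old call site passed similarity_threshold=0.8 while the new one relies on the default. A sketch of the kind of embedding-based near-duplicate check those call sites imply, assuming a sentence-transformers model like the similarity_model used elsewhere in the file:

```python
from typing import List
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model choice

def is_content_unique(new_content: str, existing_contents: List[str],
                      similarity_threshold: float = 0.8) -> bool:
    # Keep a summary only if it is not a near-duplicate of one already selected.
    if not existing_contents:
        return True
    new_emb = model.encode(new_content, convert_to_tensor=True)
    existing_embs = model.encode(existing_contents, convert_to_tensor=True)
    max_sim = util.cos_sim(new_emb, existing_embs).max().item()
    return max_sim < similarity_threshold
```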
@@ -1157,12 +953,6 @@ def search_and_scrape(
         logger.error(f"Unexpected error in search_and_scrape: {e}")
         return f"An unexpected error occurred during the search and scrape process: {e}"
 
-def normalize_scores(scores: np.ndarray) -> np.ndarray:
-    """Normalize scores to range [0, 1]"""
-    if np.all(scores == scores[0]):
-        return np.ones_like(scores)
-    return (scores - np.min(scores)) / (np.max(scores) - np.min(scores))
-
 # Helper function to get the appropriate client for each model
 def get_client_for_model(model: str) -> Any:
     if model == "huggingface":
@@ -1218,7 +1008,7 @@ iface = gr.ChatInterface(
     description="Ask Sentinel any question. It will search the web for recent information or use its knowledge base as appropriate.",
     theme=gr.Theme.from_hub("allenai/gradio-theme"),
     additional_inputs=[
-        gr.Checkbox(label="Only do web search", value=False),  # Add this line
+        gr.Checkbox(label="Only do web search", value=True),  # Add this line
         gr.Slider(5, 20, value=3, step=1, label="Number of initial results"),
         gr.Slider(500, 10000, value=1500, step=100, label="Max characters to retrieve"),
         gr.Dropdown(["", "day", "week", "month", "year"], value="week", label="Time Range"),
@@ -1231,7 +1021,7 @@ iface = gr.ChatInterface(
             label="Engines"
         ),
         gr.Slider(0, 2, value=2, step=1, label="Safe Search Level"),
-        gr.Radio(["GET", "POST"], value="POST", label="HTTP Method"),
+        gr.Radio(["GET", "POST"], value="GET", label="HTTP Method"),
         gr.Slider(0, 1, value=0.2, step=0.1, label="LLM Temperature"),
         gr.Dropdown(all_models, value=default_model, label="LLM Model"),
         gr.Checkbox(label="Use PyPDF2 for PDF scraping", value=True),
@@ -1250,4 +1040,4 @@ iface = gr.ChatInterface(
 
 if __name__ == "__main__":
     logger.info("Starting the SearXNG Scraper for News using ChatInterface with Advanced Parameters")
-    iface.launch(share=False)
+    iface.launch(share=True)