Update app.py
app.py
CHANGED
@@ -39,8 +39,6 @@ from typing import List, Dict, Tuple
 import datetime
 from abc import ABC, abstractmethod
 from typing import List, Dict, Any
-import spacy
-from textblob import TextBlob
 
 # Automatically get the current year
 CURRENT_YEAR = datetime.datetime.now().year
@@ -86,7 +84,7 @@ custom_models = fetch_custom_models()
 all_models = ["huggingface", "groq", "mistral"] + custom_models
 
 # Determine the default model
-default_model = CUSTOM_LLM_DEFAULT_MODEL if CUSTOM_LLM_DEFAULT_MODEL in all_models else "
+default_model = CUSTOM_LLM_DEFAULT_MODEL if CUSTOM_LLM_DEFAULT_MODEL in all_models else "groq"
 
 logger.info(f"Default model selected: {default_model}")
@@ -536,212 +534,75 @@ def prepare_documents_for_bm25(documents: List[Dict]) -> Tuple[List[str], List[Dict]]:
         doc_texts.append(doc_text)
     return doc_texts, documents
 
-
-class ImprovedRanking:
-    def __init__(self):
-        # Load spacy for text analysis
-        self.nlp = spacy.load('en_core_web_sm')
-
-    def analyze_query(self, query: str) -> Dict:
-        """
-        Analyze query to determine appropriate weights
-
-        Args:
-            query: Search query string
-
-        Returns:
-            Dictionary with query analysis results
-        """
-        doc = self.nlp(query)
-
-        analysis = {
-            'word_count': len(query.split()),
-            'has_entities': bool(doc.ents),
-            'is_question': any(token.tag_ == 'WP' or token.tag_ == 'WRB' for token in doc),
-            'sentiment': TextBlob(query).sentiment.polarity
-        }
-
-        return analysis
-
-    def get_adaptive_weights(self, query: str) -> Tuple[float, float]:
-        """
-        Calculate adaptive weights based on query characteristics
-
-        Args:
-            query: Search query string
-
-        Returns:
-            Tuple of (bm25_weight, semantic_weight)
-        """
-        analysis = self.analyze_query(query)
-
-        # Base weights
-        bm25_weight = 0.4
-        semantic_weight = 0.6
-
-        # Adjust weights based on query characteristics
-        if analysis['word_count'] <= 2:
-            # Short queries: favor keyword matching
-            bm25_weight = 0.6
-            semantic_weight = 0.4
-        elif analysis['word_count'] >= 6:
-            # Long queries: favor semantic understanding
-            bm25_weight = 0.3
-            semantic_weight = 0.7
-
-        if analysis['has_entities']:
-            # Queries with named entities: increase keyword importance
-            bm25_weight += 0.1
-            semantic_weight -= 0.1
-
-        if analysis['is_question']:
-            # Questions: favor semantic understanding
-            bm25_weight -= 0.1
-            semantic_weight += 0.1
-
-        # Normalize weights to ensure they sum to 1
-        total = bm25_weight + semantic_weight
-        return bm25_weight/total, semantic_weight/total
-
-    def calculate_relevance_score(self, doc: Dict, query: str, similarity_model) -> float:
-        """
-        Calculate comprehensive relevance score for a document
-
-        Args:
-            doc: Document dictionary with title and content
-            query: Search query string
-            similarity_model: Model for computing semantic similarity
-
-        Returns:
-            Float representing document relevance score
-        """
-        # 1. Title relevance (30%)
-        title_embedding = similarity_model.encode(doc['title'], convert_to_tensor=True)
-        query_embedding = similarity_model.encode(query, convert_to_tensor=True)
-        title_similarity = torch.cosine_similarity(title_embedding, query_embedding, dim=0).item()
-
-        # 2. Content relevance (40%)
-        # Use first 512 tokens of content to avoid memory issues
-        content_preview = ' '.join(doc['content'].split()[:512])
-        content_embedding = similarity_model.encode(content_preview, convert_to_tensor=True)
-        content_similarity = torch.cosine_similarity(content_embedding, query_embedding, dim=0).item()
-
-        # 3. Query term presence (20%)
-        query_terms = set(query.lower().split())
-        title_terms = set(doc['title'].lower().split())
-        content_terms = set(content_preview.lower().split())
-
-        title_term_overlap = len(query_terms & title_terms) / len(query_terms)
-        content_term_overlap = len(query_terms & content_terms) / len(query_terms)
-
-        # 4. Document quality indicators (10%)
-        quality_score = self.assess_document_quality(doc)
-
-        # Combine scores with weights
-        final_score = (
-            title_similarity * 0.3 +
-            content_similarity * 0.4 +
-            ((title_term_overlap + content_term_overlap) / 2) * 0.2 +
-            quality_score * 0.1
-        )
-
-        return final_score
-
-    def assess_document_quality(self, doc: Dict) -> float:
-        """
-        Assess document quality based on various metrics
-
-        Args:
-            doc: Document dictionary
-
-        Returns:
-            Float representing document quality score
-        """
-        score = 0.0
-
-        # 1. Length score (longer documents often have more information)
-        content_length = len(doc['content'].split())
-        length_score = min(content_length / 1000, 1.0)  # Cap at 1000 words
-
-        # 2. Text structure score
-        has_paragraphs = doc['content'].count('\n\n') > 0
-        has_sections = bool(re.findall(r'\n[A-Z][^.!?]*[:]\n', doc['content']))
-
-        # 3. Writing quality score (using basic metrics)
-        blob = TextBlob(doc['content'])
-        sentences = blob.sentences
-        avg_sentence_length = sum(len(str(s).split()) for s in sentences) / len(sentences) if sentences else 0
-        sentence_score = 1.0 if 10 <= avg_sentence_length <= 25 else 0.5
-
-        # Combine quality metrics
-        score = (
-            length_score * 0.4 +
-            (has_paragraphs * 0.2 + has_sections * 0.2) +
-            sentence_score * 0.2
-        )
-
-        return score
-
 # Now modify the rerank_documents_with_priority function to include BM25 ranking
-def
-
-    """
-    Rerank documents using improved scoring system
-
-    Args:
-        query: Search query string
-        documents: List of document dictionaries
-        similarity_model: Model for computing semantic similarity
-        max_results: Maximum number of results to return
-
-    Returns:
-        List of reranked documents
-    """
-    ranker = ImprovedRanking()
-
+def rerank_documents(query: str, documents: List[Dict],
+                     similarity_threshold: float = 0.95, max_results: int = 5) -> List[Dict]:
     try:
         if not documents:
+            logger.warning("No documents to rerank.")
             return documents
 
-        #
-        bm25_weight, semantic_weight = ranker.get_adaptive_weights(query)
-
-        # Prepare documents for BM25
+        # Step 1: Prepare documents for BM25
         doc_texts, original_docs = prepare_documents_for_bm25(documents)
 
-        # Initialize and fit BM25
+        # Step 2: Initialize and fit BM25
         bm25 = BM25()
         bm25.fit(doc_texts)
 
-        # Get BM25 scores
+        # Step 3: Get BM25 scores
         bm25_scores = bm25.get_scores(query)
 
-        #
-
-
-
-        ]
+        # Step 4: Get semantic similarity scores
+        query_embedding = similarity_model.encode(query, convert_to_tensor=True)
+        doc_summaries = [doc['summary'] for doc in documents]
+        doc_embeddings = similarity_model.encode(doc_summaries, convert_to_tensor=True)
+        semantic_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
 
-        #
+        # Step 5: Combine scores (normalize first)
         bm25_scores_norm = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores))
-
+        semantic_scores_norm = (semantic_scores - torch.min(semantic_scores)) / (torch.max(semantic_scores) - torch.min(semantic_scores))
 
-        # Combine scores
-
-        semantic_weight * relevance_scores_norm)
+        # Combine scores with weights (0.4 for BM25, 0.6 for semantic similarity)
+        combined_scores = 0.4 * bm25_scores_norm + 0.6 * semantic_scores_norm.numpy()
 
-        # Create scored documents
-        scored_documents = list(zip(documents,
+        # Create scored documents with combined scores
+        scored_documents = list(zip(documents, combined_scores))
 
-        # Sort by
+        # Sort by combined score (descending)
         scored_documents.sort(key=lambda x: x[1], reverse=True)
 
-        #
-
+        # Filter similar documents
+        filtered_docs = []
+        added_contents = []
+
+        for doc, score in scored_documents:
+            if score < 0.3:  # Minimum relevance threshold
+                continue
+
+            # Check similarity with already selected documents
+            doc_embedding = similarity_model.encode(doc['summary'], convert_to_tensor=True)
+            is_similar = False
+
+            for content in added_contents:
+                content_embedding = similarity_model.encode(content, convert_to_tensor=True)
+                similarity = util.pytorch_cos_sim(doc_embedding, content_embedding)
+                if similarity > similarity_threshold:
+                    is_similar = True
+                    break
+
+            if not is_similar:
+                filtered_docs.append(doc)
+                added_contents.append(doc['summary'])
+
+            if len(filtered_docs) >= max_results:
+                break
+
+        logger.info(f"Reranked and filtered to {len(filtered_docs)} unique documents using BM25 and semantic similarity.")
+        return filtered_docs
 
     except Exception as e:
-        logger.error(f"Error during
-        return documents[:max_results]
+        logger.error(f"Error during reranking documents: {e}")
+        return documents[:max_results]  # Fallback to first max_results documents if reranking fails
 
 def compute_similarity(text1, text2):
     # Encode the texts
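Note: the new rerank_documents relies on a BM25 class exposing fit() and get_scores(), which is defined elsewhere in app.py and not part of this diff. A minimal Okapi BM25 sketch of that interface, written as an assumption for illustration rather than the repository's actual implementation:

import math
from collections import Counter
from typing import List

import numpy as np

class BM25:
    """Minimal Okapi BM25 matching the fit()/get_scores() calls above (hypothetical)."""

    def __init__(self, k1: float = 1.5, b: float = 0.75):
        self.k1, self.b = k1, b

    def fit(self, corpus: List[str]) -> None:
        # Naive whitespace tokenization; the real helper may tokenize differently
        self.docs = [doc.lower().split() for doc in corpus]
        self.doc_lens = np.array([len(d) for d in self.docs])
        self.avg_len = float(self.doc_lens.mean()) if len(self.docs) else 1.0
        n = len(self.docs)
        # Document frequency per term, then the standard smoothed IDF
        df = Counter(term for doc in self.docs for term in set(doc))
        self.idf = {t: math.log(1 + (n - f + 0.5) / (f + 0.5)) for t, f in df.items()}

    def get_scores(self, query: str) -> np.ndarray:
        scores = np.zeros(len(self.docs))
        for term in query.lower().split():
            if term not in self.idf:
                continue
            tf = np.array([doc.count(term) for doc in self.docs])
            denom = tf + self.k1 * (1 - self.b + self.b * self.doc_lens / self.avg_len)
            scores += self.idf[term] * tf * (self.k1 + 1) / denom
        return scores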
@@ -917,9 +778,6 @@ def search_and_scrape(
     use_pydf2: bool = True
 ):
     try:
-        # Initialize ImprovedRanking instead of DocumentRanker
-        document_ranker = ImprovedRanking()
-
         # Step 1: Rephrase the Query
         rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
         logger.info(f"Rephrased Query: {rephrased_query}")
@@ -928,7 +786,8 @@ def search_and_scrape(
             logger.info("No need to perform search based on the rephrased query.")
             return "No search needed for the provided input."
 
-        #
+        # Step 2: Perform search
+        # Search query parameters
         params = {
             'q': rephrased_query,
             'format': 'json',
@@ -941,11 +800,13 @@ def search_and_scrape(
 
         # Remove empty parameters
         params = {k: v for k, v in params.items() if v != ""}
 
+        # If no engines are specified, set default engines
         if 'engines' not in params:
-            params['engines'] = 'google'
+            params['engines'] = 'google'  # Default to 'google' or any preferred engine
             logger.info("No engines specified. Defaulting to 'google'.")
 
+        # Headers for SearXNG request
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
             'Accept': 'application/json, text/javascript, */*; q=0.01',
@@ -961,16 +822,18 @@ def search_and_scrape(
 
         scraped_content = []
         page = 1
-
-        # Content scraping loop remains mostly the same, but add quality assessment
         while len(scraped_content) < num_results:
+            # Update params with current page
             params['pageno'] = page
-
+
+            # Send request to SearXNG
+            logger.info(f"Sending request to SearXNG for query: {rephrased_query} (Page {page})")
+            session = requests_retry_session()
+
             try:
-                session = requests_retry_session()
                 if method.upper() == "GET":
                     response = session.get(SEARXNG_URL, params=params, headers=headers, timeout=10, verify=certifi.where())
-                else:
+                else:  # POST
                     response = session.post(SEARXNG_URL, data=params, headers=headers, timeout=10, verify=certifi.where())
 
                 response.raise_for_status()
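The loop above now builds the session via requests_retry_session() once per page, before the try block. That helper is also outside this diff; a sketch following the common requests/urllib3 retry recipe, shown as an assumption rather than the app's actual code:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def requests_retry_session(retries: int = 3, backoff_factor: float = 0.3,
                           status_forcelist: tuple = (500, 502, 503, 504)) -> requests.Session:
    # Session that retries transient failures with exponential backoff
    session = requests.Session()
    retry = Retry(total=retries, read=retries, connect=retries,
                  backoff_factor=backoff_factor, status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session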
@@ -979,8 +842,9 @@ def search_and_scrape(
                 return f"An error occurred during the search request: {e}"
 
             search_results = response.json()
+            logger.debug(f"SearXNG Response: {search_results}")
+
             results = search_results.get('results', [])
-
             if not results:
                 logger.warning(f"No more results returned from SearXNG on page {page}.")
                 break
@@ -988,40 +852,33 @@ def search_and_scrape(
             for result in results:
                 if len(scraped_content) >= num_results:
                     break
 
                 url = result.get('url', '')
                 title = result.get('title', 'No title')
 
                 if not is_valid_url(url):
                     logger.warning(f"Invalid URL: {url}")
                     continue
 
                 try:
                     logger.info(f"Processing content from: {url}")
+
                     content = scrape_full_content(url, max_chars, timeout, use_pydf2)
 
-                    if content is None:
+                    if content is None:  # This means it's a PDF and use_pydf2 is False
                         continue
 
                     if not content:
                         logger.warning(f"Failed to scrape content from {url}")
                         continue
 
-                    # Add initial quality assessment
-                    doc_quality = document_ranker.assess_document_quality({
-                        "title": title,
-                        "content": content
-                    })
-
                     scraped_content.append({
                         "title": title,
                         "url": url,
                         "content": content,
-                        "scraper": "pdf" if url.lower().endswith('.pdf') else "newspaper"
-                        "quality_score": doc_quality
+                        "scraper": "pdf" if url.lower().endswith('.pdf') else "newspaper"
                     })
-                    logger.info(f"Successfully scraped content from {url}.
-
+                    logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
                 except requests.exceptions.RequestException as e:
                     logger.error(f"Error scraping {url}: {e}")
                 except Exception as e:
@@ -1033,108 +890,48 @@ def search_and_scrape(
             logger.warning("No content scraped from search results.")
             return "No content could be scraped from the search results."
 
-
+        logger.info(f"Successfully scraped {len(scraped_content)} documents.")
+
+        # Step 4: Assess relevance, summarize, and check for uniqueness
         relevant_documents = []
-        unique_summaries =
-
+        unique_summaries = []
         for doc in scraped_content:
             assessment = assess_relevance_and_summarize(client, rephrased_query, doc, temperature=llm_temperature)
             relevance, summary = assessment.split('\n', 1)
 
             if relevance.strip().lower() == "relevant: yes":
                 summary_text = summary.replace("Summary: ", "").strip()
 
-                if is_content_unique(summary_text, unique_summaries
-                    # Calculate comprehensive relevance score using new method
-                    relevance_score = document_ranker.calculate_relevance_score(
-                        {
-                            "title": doc['title'],
-                            "content": doc['content'],
-                            "summary": summary_text
-                        },
-                        rephrased_query,
-                        similarity_model
-                    )
-
+                if is_content_unique(summary_text, unique_summaries):
                     relevant_documents.append({
                         "title": doc['title'],
                         "url": doc['url'],
-                        "content": doc['content'],
                         "summary": summary_text,
-                        "scraper": doc['scraper']
-                        "relevance_score": relevance_score,
-                        "quality_score": doc['quality_score']
+                        "scraper": doc['scraper']
                     })
-                    unique_summaries.
+                    unique_summaries.append(summary_text)
+                else:
+                    logger.info(f"Skipping similar content: {doc['title']}")
 
         if not relevant_documents:
             logger.warning("No relevant and unique documents found.")
-            return "No relevant and unique
-
-        # Enhanced reranking using improved weights and BM25
-        try:
-            # Get query-adaptive weights
-            bm25_weight, semantic_weight = document_ranker.get_adaptive_weights(rephrased_query)
-            logger.info(f"Using adaptive weights - BM25: {bm25_weight}, Semantic: {semantic_weight}")
-
-            # Prepare documents for BM25
-            doc_texts = [f"{doc['title']} {doc['content']}" for doc in relevant_documents]
-
-            # Initialize and fit BM25
-            bm25 = BM25()
-            bm25.fit(doc_texts)
-
-            # Get BM25 scores
-            bm25_scores = bm25.get_scores(rephrased_query)
-
-            # Calculate semantic scores using title and content
-            query_embedding = similarity_model.encode(rephrased_query, convert_to_tensor=True)
-            doc_embeddings = similarity_model.encode(
-                [f"{doc['title']} {doc['summary']}" for doc in relevant_documents],
-                convert_to_tensor=True
-            )
-            semantic_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
-
-            # Get quality scores
-            quality_scores = np.array([doc['quality_score'] for doc in relevant_documents])
-
-            # Normalize all scores
-            bm25_scores_norm = normalize_scores(bm25_scores)
-            semantic_scores_norm = normalize_scores(semantic_scores.numpy())
-            quality_scores_norm = normalize_scores(quality_scores)
-            relevance_scores = normalize_scores(
-                np.array([doc['relevance_score'] for doc in relevant_documents])
-            )
-
-            # Combine scores with weights
-            final_scores = (
-                bm25_weight * bm25_scores_norm +
-                semantic_weight * semantic_scores_norm +
-                0.15 * quality_scores_norm +  # Add quality score weight
-                0.15 * relevance_scores  # Reduced from 0.2 to accommodate quality
-            )
-
-            # Create scored documents
-            scored_documents = list(zip(relevant_documents, final_scores))
-            scored_documents.sort(key=lambda x: x[1], reverse=True)
-
-            # Take top results
-            reranked_docs = [doc for doc, _ in scored_documents[:num_results]]
-
-        except Exception as e:
-            logger.error(f"Error during document reranking: {e}")
-            # Fallback to basic sorting by relevance and quality
-            reranked_docs = sorted(
-                relevant_documents,
-                key=lambda x: (x['relevance_score'] + x['quality_score']) / 2,
-                reverse=True
-            )[:num_results]
+            return "No relevant and unique news found for the given query."
 
+        # Step 5: Rerank documents based on similarity to query
+        reranked_docs = rerank_documents(rephrased_query, relevant_documents, similarity_threshold=0.95, max_results=num_results)
+
         if not reranked_docs:
             logger.warning("No documents remained after reranking.")
-            return "No relevant
+            return "No relevant news found after filtering and ranking."
+
+        logger.info(f"Reranked and filtered to top {len(reranked_docs)} unique, related documents.")
 
-        #
+        # Step 5: Scrape full content for top documents (up to num_results)
+        for doc in reranked_docs[:num_results]:
+            full_content = scrape_full_content(doc['url'], max_chars)
+            doc['full_content'] = full_content
+
+        # Prepare JSON for LLM
         llm_input = {
             "query": query,
             "documents": [
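The reranking call above can be exercised in isolation. A hypothetical smoke test, assuming it runs inside app.py where similarity_model, BM25, and logger are defined (documents need only title, url, and summary keys):

docs = [
    {"title": "Mars rover update", "url": "https://example.com/a",
     "summary": "NASA's rover has located new mineral deposits."},
    {"title": "Rover news", "url": "https://example.com/b",
     "summary": "New mineral deposits found by the Mars rover."},
    {"title": "Unrelated story", "url": "https://example.com/c",
     "summary": "Stock markets closed higher on Friday."},
]
top = rerank_documents("mars rover minerals", docs, similarity_threshold=0.95, max_results=2)
print([d["url"] for d in top])  # URLs ordered by combined BM25 + semantic score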
@@ -1142,13 +939,12 @@ def search_and_scrape(
                     "title": doc['title'],
                     "url": doc['url'],
                     "summary": doc['summary'],
-                    "
-
-                } for doc in reranked_docs
+                    "full_content": doc['full_content']
+                } for doc in reranked_docs[:num_results]
             ]
         }
 
-        # LLM Summarization
+        # Step 6: LLM Summarization
         llm_summary = llm_summarize(json.dumps(llm_input), model, temperature=llm_temperature)
 
         return llm_summary
@@ -1157,12 +953,6 @@ def search_and_scrape(
         logger.error(f"Unexpected error in search_and_scrape: {e}")
         return f"An unexpected error occurred during the search and scrape process: {e}"
 
-def normalize_scores(scores: np.ndarray) -> np.ndarray:
-    """Normalize scores to range [0, 1]"""
-    if np.all(scores == scores[0]):
-        return np.ones_like(scores)
-    return (scores - np.min(scores)) / (np.max(scores) - np.min(scores))
-
 # Helper function to get the appropriate client for each model
 def get_client_for_model(model: str) -> Any:
     if model == "huggingface":
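Note that this removal leaves the inline min-max normalization inside rerank_documents unguarded: np.max(bm25_scores) - np.min(bm25_scores) is zero whenever every BM25 score is identical (for example, when no query term appears in any document), a case the deleted helper handled explicitly. For reference, the removed function as it stood, which could be reinstated at either normalization site:

import numpy as np

def normalize_scores(scores: np.ndarray) -> np.ndarray:
    """Normalize scores to the range [0, 1]; constant inputs map to ones."""
    if np.all(scores == scores[0]):
        return np.ones_like(scores)
    return (scores - np.min(scores)) / (np.max(scores) - np.min(scores))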
@@ -1218,7 +1008,7 @@ iface = gr.ChatInterface(
    description="Ask Sentinel any question. It will search the web for recent information or use its knowledge base as appropriate.",
    theme=gr.Theme.from_hub("allenai/gradio-theme"),
    additional_inputs=[
-        gr.Checkbox(label="Only do web search", value=
+        gr.Checkbox(label="Only do web search", value=True),  # Add this line
        gr.Slider(5, 20, value=3, step=1, label="Number of initial results"),
        gr.Slider(500, 10000, value=1500, step=100, label="Max characters to retrieve"),
        gr.Dropdown(["", "day", "week", "month", "year"], value="week", label="Time Range"),
@@ -1231,7 +1021,7 @@ iface = gr.ChatInterface(
            label="Engines"
        ),
        gr.Slider(0, 2, value=2, step=1, label="Safe Search Level"),
-        gr.Radio(["GET", "POST"], value="
+        gr.Radio(["GET", "POST"], value="GET", label="HTTP Method"),
        gr.Slider(0, 1, value=0.2, step=0.1, label="LLM Temperature"),
        gr.Dropdown(all_models, value=default_model, label="LLM Model"),
        gr.Checkbox(label="Use PyPDF2 for PDF scraping", value=True),
@@ -1250,4 +1040,4 @@ iface = gr.ChatInterface(
 
 if __name__ == "__main__":
     logger.info("Starting the SearXNG Scraper for News using ChatInterface with Advanced Parameters")
-    iface.launch(share=
+    iface.launch(share=True)