Nymbo committed
Commit cacc654 · verified · 1 Parent(s): 2ea6e76

Big improvements to Fetch and Web Search

Files changed (1)
  1. app.py +185 -55
app.py CHANGED
@@ -209,9 +209,54 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
     return clean_text, s


-def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str) -> str:
+def _extract_links_from_soup(soup: BeautifulSoup, base_url: str) -> str:
+    """
+    Extract all links from the page and return as formatted text.
+    """
+    links = []
+    for link in soup.find_all("a", href=True):
+        href = link.get("href")
+        text = link.get_text(strip=True)
+
+        # Make relative URLs absolute
+        if href.startswith("http"):
+            full_url = href
+        elif href.startswith("//"):
+            full_url = "https:" + href
+        elif href.startswith("/"):
+            from urllib.parse import urljoin
+            full_url = urljoin(base_url, href)
+        else:
+            from urllib.parse import urljoin
+            full_url = urljoin(base_url, href)
+
+        if text and href not in ["#", "javascript:void(0)"]:
+            links.append(f"- [{text}]({full_url})")
+
+    if not links:
+        return "No links found on this page."
+
+    # Add title if present
+    title = soup.find("title")
+    title_text = title.get_text(strip=True) if title else "Links from webpage"
+
+    return f"# {title_text}\n\n" + "\n".join(links)
+
+
+def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str, strip_selectors: str = "") -> str:

-    # Remove unwanted elements globally first
+    # Remove custom selectors first if provided
+    if strip_selectors:
+        selectors = [s.strip() for s in strip_selectors.split(",") if s.strip()]
+        for selector in selectors:
+            try:
+                for element in full_soup.select(selector):
+                    element.decompose()
+            except Exception:
+                # Invalid CSS selector, skip it
+                continue
+
+    # Remove unwanted elements globally
     for element in full_soup.select("script, style, nav, footer, header, aside"):
         element.decompose()

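For reference, a minimal sketch of how the new `_extract_links_from_soup` helper behaves; the HTML fragment and base URL below are invented for illustration and assume the helper from app.py is in scope.

```python
from bs4 import BeautifulSoup  # already an app.py dependency

html = """<html><head><title>Example</title></head><body>
<a href="/docs">Docs</a>
<a href="https://example.org/blog">Blog</a>
<a href="#">Skip me</a>
</body></html>"""

soup = BeautifulSoup(html, "lxml")
print(_extract_links_from_soup(soup, "https://example.com"))
# Expected shape of the output:
# # Example
#
# - [Docs](https://example.com/docs)
# - [Blog](https://example.org/blog)
```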
@@ -270,32 +315,28 @@ def _truncate_markdown(markdown: str, max_chars: int) -> str:

 def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     url: Annotated[str, "The absolute URL to fetch (must return HTML)."],
-    verbosity: Annotated[str, "Controls output length: 'Brief' (1000 chars), 'Standard' (3000 chars), or 'Full' (complete page)."] = "Standard",
+    max_chars: Annotated[int, "Maximum characters to return (0 = no limit, full page content)."] = 3000,
+    strip_selectors: Annotated[str, "CSS selectors to remove (comma-separated, e.g., '.header, .footer, nav')."] = "",
+    url_scraper: Annotated[bool, "Extract only links from the page instead of content."] = False,
 ) -> str:
     """
-    Fetch a web page and return it converted to Markdown format with configurable length.
+    Fetch a web page and return it converted to Markdown format with configurable options.

-    This function retrieves a webpage and converts its main content to clean Markdown,
-    preserving headings, formatting, and structure. It automatically removes navigation,
-    footers, scripts, and other non-content elements to focus on the main article or
-    content area.
+    This function retrieves a webpage and either converts its main content to clean Markdown
+    or extracts all links from the page. It automatically removes navigation, footers,
+    scripts, and other non-content elements, plus any custom selectors you specify.

     Args:
         url (str): The absolute URL to fetch (must return HTML).
-        verbosity (str): Controls output length:
-            - "Brief": Truncate to 1000 characters for quick summaries
-            - "Standard": Truncate to 3000 characters for balanced content
-            - "Full": Return complete page content with no length limit
+        max_chars (int): Maximum characters to return. Use 0 for no limit (full page).
+        strip_selectors (str): CSS selectors to remove before processing (comma-separated).
+        url_scraper (bool): If True, extract only links instead of content.

     Returns:
-        str: The webpage content converted to Markdown format with:
-            - Page title as H1 header
-            - Main content converted to clean Markdown
-            - Preserved heading hierarchy
-            - Clean formatting without navigation/sidebar elements
-            - Length controlled by verbosity setting
+        str: Either the webpage content converted to Markdown or a list of all links,
+            depending on the url_scraper setting. Content is length-limited by max_chars.
     """
-    _log_call_start("Fetch_Webpage", url=url, verbosity=verbosity)
+    _log_call_start("Fetch_Webpage", url=url, max_chars=max_chars, strip_selectors=strip_selectors, url_scraper=url_scraper)
     if not url or not url.strip():
         result = "Please enter a valid URL."
         _log_call_end("Fetch_Webpage", _truncate_for_log(result))
@@ -320,18 +361,21 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     resp.encoding = resp.encoding or resp.apparent_encoding
     html = resp.text

-    # Parse HTML and convert to full-page Markdown
+    # Parse HTML
     full_soup = BeautifulSoup(html, "lxml")
-    markdown_content = _fullpage_markdown_from_soup(full_soup, final_url)

-    # Apply verbosity-based truncation
-    if verbosity == "Brief":
-        result = _truncate_markdown(markdown_content, 1000)
-    elif verbosity == "Standard":
-        result = _truncate_markdown(markdown_content, 3000)
-    else: # "Full"
-        result = markdown_content
-    _log_call_end("Fetch_Webpage", f"markdown_chars={len(result)}")
+    if url_scraper:
+        # Extract links mode
+        result = _extract_links_from_soup(full_soup, final_url)
+    else:
+        # Convert to markdown mode
+        result = _fullpage_markdown_from_soup(full_soup, final_url, strip_selectors)
+
+    # Apply max_chars truncation if specified
+    if max_chars > 0 and len(result) > max_chars:
+        result = _truncate_markdown(result, max_chars)
+
+    _log_call_end("Fetch_Webpage", f"chars={len(result)}, url_scraper={url_scraper}")
     return result

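The hunks above replace the old `verbosity` presets with explicit `max_chars`, `strip_selectors`, and `url_scraper` controls. A rough usage sketch, assuming app.py's tool function is in scope; the URL and selector values are hypothetical, not from the commit:

```python
# Markdown mode: cap output at 3000 chars and strip extra page furniture first.
md = Fetch_Webpage(
    url="https://example.com/article",
    max_chars=3000,                           # 0 would return the full page
    strip_selectors=".sidebar, .newsletter",  # removed before Markdown conversion
    url_scraper=False,
)

# Links-only mode: return every link on the page as a Markdown list instead.
links = Fetch_Webpage(url="https://example.com/article", url_scraper=True)
```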
@@ -412,12 +456,45 @@ def _log_call_end(func_name: str, output_desc: str) -> None:
     except Exception as e: # pragma: no cover
         print(f"[TOOL RESULT] {func_name} (failed to log output: {e})", flush=True)

+def _extract_date_from_snippet(snippet: str) -> str:
+    """
+    Extract publication date from search result snippet using common patterns.
+    """
+    import re
+    from datetime import datetime
+
+    if not snippet:
+        return ""
+
+    # Common date patterns
+    date_patterns = [
+        # ISO format: 2023-12-25, 2023/12/25
+        r'\b(\d{4}[-/]\d{1,2}[-/]\d{1,2})\b',
+        # US format: Dec 25, 2023 | December 25, 2023
+        r'\b([A-Za-z]{3,9}\s+\d{1,2},?\s+\d{4})\b',
+        # EU format: 25 Dec 2023 | 25 December 2023
+        r'\b(\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4})\b',
+        # Relative: "2 days ago", "1 week ago", "3 months ago"
+        r'\b(\d+\s+(?:day|week|month|year)s?\s+ago)\b',
+        # Common prefixes: "Published: ", "Updated: ", "Posted: "
+        r'(?:Published|Updated|Posted):\s*([^,\n]+?)(?:[,\n]|$)',
+    ]
+
+    for pattern in date_patterns:
+        matches = re.findall(pattern, snippet, re.IGNORECASE)
+        if matches:
+            return matches[0].strip()
+
+    return ""
+
+
 def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
     query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."],
     max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
+    page: Annotated[int, "Page number for pagination (1-based, each page contains max_results items)."] = 1,
 ) -> str:
     """
-    Run a DuckDuckGo search and return numbered results with URLs, titles, and summaries.
+    Run a DuckDuckGo search and return numbered results with URLs, titles, snippets, and dates.

     Args:
         query (str): The search query string. Supports operators like site:, quotes for exact matching,
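A quick illustration of what `_extract_date_from_snippet` is intended to pick out; the snippets below are invented, and the expected values follow from the patterns above (tried in order: ISO, US, EU, relative, then Published/Updated/Posted prefixes):

```python
print(_extract_date_from_snippet("Dec 25, 2023 - Holiday traffic surges"))  # "Dec 25, 2023"
print(_extract_date_from_snippet("Posted 3 days ago by the team"))          # "3 days ago"
print(_extract_date_from_snippet("2023/01/15 server maintenance window"))   # "2023/01/15"
print(_extract_date_from_snippet("No date in this snippet"))                # "" (no match)
```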
@@ -427,27 +504,36 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
             - Site search: "site:example.com"
             - Exact phrase: "artificial intelligence"
             - Exclude terms: "cats -dogs"
-        max_results (int): Number of results to return (1–20). Default: 5.
+        max_results (int): Number of results to return per page (1–20). Default: 5.
+        page (int): Page number for pagination (1-based). Default: 1.

     Returns:
-        str: Search results in readable format with titles, URLs, and snippets as a numbered list.
+        str: Search results in readable format with titles, URLs, snippets, and publication dates
+            when available, formatted as a numbered list with pagination info.
     """
-    _log_call_start("Search_DuckDuckGo", query=query, max_results=max_results)
+    _log_call_start("Search_DuckDuckGo", query=query, max_results=max_results, page=page)
     if not query or not query.strip():
         result = "No search query provided. Please enter a search term."
         _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
         return result

-    # Validate max_results
+    # Validate parameters
     max_results = max(1, min(20, max_results))
+    page = max(1, page)
+
+    # Calculate offset for pagination
+    offset = (page - 1) * max_results
+    total_needed = offset + max_results

     try:
         # Apply rate limiting to avoid being blocked
         _search_rate_limiter.acquire()

         # Perform search with timeout handling
+        # We need to get more results than needed for pagination
         with DDGS() as ddgs:
-            raw = ddgs.text(query, max_results=max_results)
+            raw_gen = ddgs.text(query, max_results=total_needed + 10) # Get extra for safety
+            raw = list(raw_gen)

     except Exception as e:
         error_msg = f"Search failed: {str(e)[:200]}"
@@ -466,9 +552,16 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
         _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
         return result

-    results = []
+    # Apply pagination by slicing the results
+    paginated_results = raw[offset:offset + max_results]
+
+    if not paginated_results:
+        result = f"No results found on page {page} for query: {query}. Try page 1 or reduce page number."
+        _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
+        return result

-    for r in raw or []:
+    results = []
+    for r in paginated_results:
         title = (r.get("title") or "").strip()
         url = (r.get("href") or r.get("link") or "").strip()
         body = (r.get("body") or r.get("snippet") or "").strip()
@@ -476,29 +569,46 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
         if not url:
             continue

+        # Extract date from snippet
+        date_found = _extract_date_from_snippet(body)
+
         result_obj = {
             "title": title or _domain_of(url),
             "url": url,
-            "snippet": body
+            "snippet": body,
+            "date": date_found
         }

         results.append(result_obj)

     if not results:
-        result = f"No valid results found for query: {query}"
+        result = f"No valid results found on page {page} for query: {query}"
         _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
         return result

-    # Format output in readable format
-    lines = [f"Found {len(results)} search results for: {query}\n"]
-    for i, result in enumerate(results, 1):
+    # Format output in readable format with pagination info
+    total_available = len(raw)
+    start_num = offset + 1
+    end_num = offset + len(results)
+
+    lines = [f"Search results for: {query}"]
+    lines.append(f"Page {page} (results {start_num}-{end_num} of ~{total_available}+ available)\n")
+
+    for i, result in enumerate(results, start_num):
         lines.append(f"{i}. {result['title']}")
         lines.append(f" URL: {result['url']}")
         if result['snippet']:
             lines.append(f" Summary: {result['snippet']}")
+        if result['date']:
+            lines.append(f" Date: {result['date']}")
         lines.append("") # Empty line between results
+
+    # Add pagination hint
+    if total_available > end_num:
+        lines.append(f"💡 More results available - use page={page + 1} to see next {max_results} results")
+
     result = "\n".join(lines)
-    _log_call_end("Search_DuckDuckGo", f"results={len(results)} chars={len(result)}")
+    _log_call_end("Search_DuckDuckGo", f"page={page} results={len(results)} chars={len(result)}")
     return result

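The pagination above works by over-fetching and slicing: the offset is (page - 1) * max_results, the DDGS call requests offset + max_results + 10 hits, and the page's slice is taken from that list. A hedged sketch of calling the reworked tool (the query is illustrative only):

```python
# Page 1 returns results 1-5; page 2 skips the first 5 (offset = 5) and returns 6-10.
first_page = Search_DuckDuckGo("site:python.org asyncio", max_results=5, page=1)
second_page = Search_DuckDuckGo("site:python.org asyncio", max_results=5, page=2)

print(second_page)
# Header resembles:
#   Search results for: site:python.org asyncio
#   Page 2 (results 6-10 of ~20+ available)
# followed by numbered entries with URL, Summary, and Date lines when a date is detected.
```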
@@ -985,21 +1095,38 @@ fetch_interface = gr.Interface(
     fn=Fetch_Webpage,
     inputs=[
         gr.Textbox(label="URL", placeholder="https://example.com/article"),
-        gr.Dropdown(
-            label="Verbosity",
-            choices=["Brief", "Standard", "Full"],
-            value="Standard",
-            info="Brief: 1000 chars, Standard: 3000 chars, Full: complete page"
+        gr.Slider(
+            minimum=0,
+            maximum=20000,
+            value=3000,
+            step=100,
+            label="Max Characters",
+            info="0 = no limit (full page), default 3000"
+        ),
+        gr.Textbox(
+            label="Strip Selectors",
+            placeholder=".header, .footer, nav, .sidebar",
+            value="",
+            info="CSS selectors to remove (comma-separated)"
+        ),
+        gr.Checkbox(
+            label="URL Scraper",
+            value=False,
+            info="Extract only links instead of content"
         ),
     ],
-    outputs=gr.Markdown(label="Extracted Markdown"),
+    outputs=gr.Markdown(label="Extracted Content"),
     title="Fetch Webpage",
     description=(
-        "<div style=\"text-align:center\">Convert any webpage to clean Markdown format with configurable length, preserving structure and formatting while removing navigation and clutter.</div>"
+        "<div style=\"text-align:center\">Convert any webpage to clean Markdown format with precision controls, or extract all links. Supports custom element removal and length limits.</div>"
     ),
     api_description=(
-        "Fetch a web page and return it converted to Markdown format with configurable length. "
-        "Parameters: url (str - absolute URL), verbosity (str - Brief/Standard/Full controlling output length: Brief=1000 chars, Standard=3000 chars, Full=complete page)."
+        "Fetch a web page and return it converted to Markdown format or extract links with configurable options. "
+        "Parameters: url (str - absolute URL), max_chars (int - 0=no limit, default 3000), "
+        "strip_selectors (str - CSS selectors to remove, comma-separated), "
+        "url_scraper (bool - extract only links instead of content, default False). "
+        "When url_scraper=True, returns formatted list of all links found on the page. "
+        "When False, returns clean Markdown content with custom element removal and length control."
     ),
     flagging_mode="never",
 )
@@ -1010,17 +1137,20 @@ concise_interface = gr.Interface(
     inputs=[
         gr.Textbox(label="Query", placeholder="topic OR site:example.com"),
         gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
+        gr.Slider(minimum=1, maximum=10, value=1, step=1, label="Page", info="Page number for pagination"),
     ],
     outputs=gr.Textbox(label="Search Results", interactive=False),
     title="DuckDuckGo Search",
     description=(
-        "<div style=\"text-align:center\">Web search with readable output format. Supports advanced search operators.</div>"
+        "<div style=\"text-align:center\">Web search with readable output format, date detection, and pagination support. Supports advanced search operators.</div>"
     ),
     api_description=(
-        "Run a DuckDuckGo search and return numbered results with URLs, titles, and summaries. "
+        "Run a DuckDuckGo search and return numbered results with URLs, titles, summaries, and publication dates when detectable. "
         "Supports advanced search operators: site: for specific domains, quotes for exact phrases, "
         "OR for alternatives, and - to exclude terms. Examples: 'Python programming', 'site:example.com', "
-        "'\"artificial intelligence\"', 'cats -dogs', 'Python OR JavaScript'."
+        "'\"artificial intelligence\"', 'cats -dogs', 'Python OR JavaScript'. "
+        "Parameters: query (str), max_results (int, 1-20), page (int, 1-based pagination). "
+        "Returns formatted results with date metadata and pagination hints for accessing more results."
     ),
     flagging_mode="never",
     submit_btn="Search",