Nymbo committed
Commit cacc654 · verified · 1 Parent(s): 2ea6e76

Big improvements to Fetch and Web Search

Files changed (1)
  1. app.py +185 -55
app.py CHANGED
@@ -209,9 +209,54 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
     return clean_text, s


-def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str) -> str:
+def _extract_links_from_soup(soup: BeautifulSoup, base_url: str) -> str:
+    """
+    Extract all links from the page and return as formatted text.
+    """
+    links = []
+    for link in soup.find_all("a", href=True):
+        href = link.get("href")
+        text = link.get_text(strip=True)
+
+        # Make relative URLs absolute
+        if href.startswith("http"):
+            full_url = href
+        elif href.startswith("//"):
+            full_url = "https:" + href
+        elif href.startswith("/"):
+            from urllib.parse import urljoin
+            full_url = urljoin(base_url, href)
+        else:
+            from urllib.parse import urljoin
+            full_url = urljoin(base_url, href)
+
+        if text and href not in ["#", "javascript:void(0)"]:
+            links.append(f"- [{text}]({full_url})")
+
+    if not links:
+        return "No links found on this page."
+
+    # Add title if present
+    title = soup.find("title")
+    title_text = title.get_text(strip=True) if title else "Links from webpage"
+
+    return f"# {title_text}\n\n" + "\n".join(links)
+
+
+def _fullpage_markdown_from_soup(full_soup: BeautifulSoup, base_url: str, strip_selectors: str = "") -> str:

-    # Remove unwanted elements globally first
+    # Remove custom selectors first if provided
+    if strip_selectors:
+        selectors = [s.strip() for s in strip_selectors.split(",") if s.strip()]
+        for selector in selectors:
+            try:
+                for element in full_soup.select(selector):
+                    element.decompose()
+            except Exception:
+                # Invalid CSS selector, skip it
+                continue
+
+    # Remove unwanted elements globally
     for element in full_soup.select("script, style, nav, footer, header, aside"):
         element.decompose()

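For reference, a minimal sketch of how the new `_extract_links_from_soup` helper behaves; the HTML fragment and base URL below are invented for illustration and assume the helper from app.py is in scope.

```python
from bs4 import BeautifulSoup  # already an app.py dependency

html = """<html><head><title>Example</title></head><body>
<a href="/docs">Docs</a>
<a href="https://example.org/blog">Blog</a>
<a href="#">Skip me</a>
</body></html>"""

soup = BeautifulSoup(html, "lxml")
print(_extract_links_from_soup(soup, "https://example.com"))
# Expected shape of the output:
# # Example
#
# - [Docs](https://example.com/docs)
# - [Blog](https://example.org/blog)
```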
@@ -270,32 +315,28 @@ def _truncate_markdown(markdown: str, max_chars: int) -> str:

 def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     url: Annotated[str, "The absolute URL to fetch (must return HTML)."],
-    verbosity: Annotated[str, "Controls output length: 'Brief' (1000 chars), 'Standard' (3000 chars), or 'Full' (complete page)."] = "Standard",
+    max_chars: Annotated[int, "Maximum characters to return (0 = no limit, full page content)."] = 3000,
+    strip_selectors: Annotated[str, "CSS selectors to remove (comma-separated, e.g., '.header, .footer, nav')."] = "",
+    url_scraper: Annotated[bool, "Extract only links from the page instead of content."] = False,
 ) -> str:
     """
-    Fetch a web page and return it converted to Markdown format with configurable length.
+    Fetch a web page and return it converted to Markdown format with configurable options.

-    This function retrieves a webpage and converts its main content to clean Markdown,
-    preserving headings, formatting, and structure. It automatically removes navigation,
-    footers, scripts, and other non-content elements to focus on the main article or
-    content area.
+    This function retrieves a webpage and either converts its main content to clean Markdown
+    or extracts all links from the page. It automatically removes navigation, footers,
+    scripts, and other non-content elements, plus any custom selectors you specify.

     Args:
         url (str): The absolute URL to fetch (must return HTML).
-        verbosity (str): Controls output length:
-            - "Brief": Truncate to 1000 characters for quick summaries
-            - "Standard": Truncate to 3000 characters for balanced content
-            - "Full": Return complete page content with no length limit
+        max_chars (int): Maximum characters to return. Use 0 for no limit (full page).
+        strip_selectors (str): CSS selectors to remove before processing (comma-separated).
+        url_scraper (bool): If True, extract only links instead of content.

     Returns:
-        str: The webpage content converted to Markdown format with:
-            - Page title as H1 header
-            - Main content converted to clean Markdown
-            - Preserved heading hierarchy
-            - Clean formatting without navigation/sidebar elements
-            - Length controlled by verbosity setting
+        str: Either the webpage content converted to Markdown or a list of all links,
+            depending on the url_scraper setting. Content is length-limited by max_chars.
     """
-    _log_call_start("Fetch_Webpage", url=url, verbosity=verbosity)
+    _log_call_start("Fetch_Webpage", url=url, max_chars=max_chars, strip_selectors=strip_selectors, url_scraper=url_scraper)
     if not url or not url.strip():
         result = "Please enter a valid URL."
         _log_call_end("Fetch_Webpage", _truncate_for_log(result))
@@ -320,18 +361,21 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
     resp.encoding = resp.encoding or resp.apparent_encoding
     html = resp.text

-    # Parse HTML and convert to full-page Markdown
+    # Parse HTML
     full_soup = BeautifulSoup(html, "lxml")
-    markdown_content = _fullpage_markdown_from_soup(full_soup, final_url)

-    # Apply verbosity-based truncation
-    if verbosity == "Brief":
-        result = _truncate_markdown(markdown_content, 1000)
-    elif verbosity == "Standard":
-        result = _truncate_markdown(markdown_content, 3000)
-    else: # "Full"
-        result = markdown_content
-    _log_call_end("Fetch_Webpage", f"markdown_chars={len(result)}")
+    if url_scraper:
+        # Extract links mode
+        result = _extract_links_from_soup(full_soup, final_url)
+    else:
+        # Convert to markdown mode
+        result = _fullpage_markdown_from_soup(full_soup, final_url, strip_selectors)
+
+    # Apply max_chars truncation if specified
+    if max_chars > 0 and len(result) > max_chars:
+        result = _truncate_markdown(result, max_chars)
+
+    _log_call_end("Fetch_Webpage", f"chars={len(result)}, url_scraper={url_scraper}")
     return result

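The hunks above replace the old `verbosity` presets with explicit `max_chars`, `strip_selectors`, and `url_scraper` controls. A rough usage sketch, assuming app.py's tool function is in scope; the URL and selector values are hypothetical, not from the commit:

```python
# Markdown mode: cap output at 3000 chars and strip extra page furniture first.
md = Fetch_Webpage(
    url="https://example.com/article",
    max_chars=3000,                           # 0 would return the full page
    strip_selectors=".sidebar, .newsletter",  # removed before Markdown conversion
    url_scraper=False,
)

# Links-only mode: return every link on the page as a Markdown list instead.
links = Fetch_Webpage(url="https://example.com/article", url_scraper=True)
```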
@@ -412,12 +456,45 @@ def _log_call_end(func_name: str, output_desc: str) -> None:
     except Exception as e: # pragma: no cover
         print(f"[TOOL RESULT] {func_name} (failed to log output: {e})", flush=True)

+def _extract_date_from_snippet(snippet: str) -> str:
+    """
+    Extract publication date from search result snippet using common patterns.
+    """
+    import re
+    from datetime import datetime
+
+    if not snippet:
+        return ""
+
+    # Common date patterns
+    date_patterns = [
+        # ISO format: 2023-12-25, 2023/12/25
+        r'\b(\d{4}[-/]\d{1,2}[-/]\d{1,2})\b',
+        # US format: Dec 25, 2023 | December 25, 2023
+        r'\b([A-Za-z]{3,9}\s+\d{1,2},?\s+\d{4})\b',
+        # EU format: 25 Dec 2023 | 25 December 2023
+        r'\b(\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4})\b',
+        # Relative: "2 days ago", "1 week ago", "3 months ago"
+        r'\b(\d+\s+(?:day|week|month|year)s?\s+ago)\b',
+        # Common prefixes: "Published: ", "Updated: ", "Posted: "
+        r'(?:Published|Updated|Posted):\s*([^,\n]+?)(?:[,\n]|$)',
+    ]
+
+    for pattern in date_patterns:
+        matches = re.findall(pattern, snippet, re.IGNORECASE)
+        if matches:
+            return matches[0].strip()
+
+    return ""
+
+
 def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
     query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."],
     max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
+    page: Annotated[int, "Page number for pagination (1-based, each page contains max_results items)."] = 1,
 ) -> str:
     """
-    Run a DuckDuckGo search and return numbered results with URLs, titles, and summaries.
+    Run a DuckDuckGo search and return numbered results with URLs, titles, snippets, and dates.

     Args:
         query (str): The search query string. Supports operators like site:, quotes for exact matching,
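A quick illustration of what `_extract_date_from_snippet` is intended to pick out; the snippets below are invented, and the expected values follow from the patterns above (tried in order: ISO, US, EU, relative, then Published/Updated/Posted prefixes):

```python
print(_extract_date_from_snippet("Dec 25, 2023 - Holiday traffic surges"))  # "Dec 25, 2023"
print(_extract_date_from_snippet("Posted 3 days ago by the team"))          # "3 days ago"
print(_extract_date_from_snippet("2023/01/15 server maintenance window"))   # "2023/01/15"
print(_extract_date_from_snippet("No date in this snippet"))                # "" (no match)
```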
@@ -427,27 +504,36 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
             - Site search: "site:example.com"
             - Exact phrase: "artificial intelligence"
             - Exclude terms: "cats -dogs"
-        max_results (int): Number of results to return (1–20). Default: 5.
+        max_results (int): Number of results to return per page (1–20). Default: 5.
+        page (int): Page number for pagination (1-based). Default: 1.

     Returns:
-        str: Search results in readable format with titles, URLs, and snippets as a numbered list.
+        str: Search results in readable format with titles, URLs, snippets, and publication dates
+            when available, formatted as a numbered list with pagination info.
     """
-    _log_call_start("Search_DuckDuckGo", query=query, max_results=max_results)
+    _log_call_start("Search_DuckDuckGo", query=query, max_results=max_results, page=page)
     if not query or not query.strip():
         result = "No search query provided. Please enter a search term."
         _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
         return result

-    # Validate max_results
+    # Validate parameters
     max_results = max(1, min(20, max_results))
+    page = max(1, page)
+
+    # Calculate offset for pagination
+    offset = (page - 1) * max_results
+    total_needed = offset + max_results

     try:
         # Apply rate limiting to avoid being blocked
         _search_rate_limiter.acquire()

         # Perform search with timeout handling
+        # We need to get more results than needed for pagination
         with DDGS() as ddgs:
-            raw = ddgs.text(query, max_results=max_results)
+            raw_gen = ddgs.text(query, max_results=total_needed + 10) # Get extra for safety
+            raw = list(raw_gen)

     except Exception as e:
         error_msg = f"Search failed: {str(e)[:200]}"
@@ -466,9 +552,16 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
         _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
         return result

-    results = []
+    # Apply pagination by slicing the results
+    paginated_results = raw[offset:offset + max_results]
+
+    if not paginated_results:
+        result = f"No results found on page {page} for query: {query}. Try page 1 or reduce page number."
+        _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
+        return result

-    for r in raw or []:
+    results = []
+    for r in paginated_results:
         title = (r.get("title") or "").strip()
         url = (r.get("href") or r.get("link") or "").strip()
         body = (r.get("body") or r.get("snippet") or "").strip()
@@ -476,29 +569,46 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
         if not url:
             continue

+        # Extract date from snippet
+        date_found = _extract_date_from_snippet(body)
+
         result_obj = {
             "title": title or _domain_of(url),
             "url": url,
-            "snippet": body
+            "snippet": body,
+            "date": date_found
         }

         results.append(result_obj)

     if not results:
-        result = f"No valid results found for query: {query}"
+        result = f"No valid results found on page {page} for query: {query}"
         _log_call_end("Search_DuckDuckGo", _truncate_for_log(result))
         return result

-    # Format output in readable format
-    lines = [f"Found {len(results)} search results for: {query}\n"]
-    for i, result in enumerate(results, 1):
+    # Format output in readable format with pagination info
+    total_available = len(raw)
+    start_num = offset + 1
+    end_num = offset + len(results)
+
+    lines = [f"Search results for: {query}"]
+    lines.append(f"Page {page} (results {start_num}-{end_num} of ~{total_available}+ available)\n")
+
+    for i, result in enumerate(results, start_num):
         lines.append(f"{i}. {result['title']}")
         lines.append(f" URL: {result['url']}")
         if result['snippet']:
             lines.append(f" Summary: {result['snippet']}")
+        if result['date']:
+            lines.append(f" Date: {result['date']}")
         lines.append("") # Empty line between results
+
+    # Add pagination hint
+    if total_available > end_num:
+        lines.append(f"💡 More results available - use page={page + 1} to see next {max_results} results")
+
     result = "\n".join(lines)
-    _log_call_end("Search_DuckDuckGo", f"results={len(results)} chars={len(result)}")
+    _log_call_end("Search_DuckDuckGo", f"page={page} results={len(results)} chars={len(result)}")
     return result

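The pagination above works by over-fetching and slicing: the offset is (page - 1) * max_results, the DDGS call requests offset + max_results + 10 hits, and the page's slice is taken from that list. A hedged sketch of calling the reworked tool (the query is illustrative only):

```python
# Page 1 returns results 1-5; page 2 skips the first 5 (offset = 5) and returns 6-10.
first_page = Search_DuckDuckGo("site:python.org asyncio", max_results=5, page=1)
second_page = Search_DuckDuckGo("site:python.org asyncio", max_results=5, page=2)

print(second_page)
# Header resembles:
#   Search results for: site:python.org asyncio
#   Page 2 (results 6-10 of ~20+ available)
# followed by numbered entries with URL, Summary, and Date lines when a date is detected.
```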
@@ -985,21 +1095,38 @@ fetch_interface = gr.Interface(
     fn=Fetch_Webpage,
     inputs=[
         gr.Textbox(label="URL", placeholder="https://example.com/article"),
-        gr.Dropdown(
-            label="Verbosity",
-            choices=["Brief", "Standard", "Full"],
-            value="Standard",
-            info="Brief: 1000 chars, Standard: 3000 chars, Full: complete page"
+        gr.Slider(
+            minimum=0,
+            maximum=20000,
+            value=3000,
+            step=100,
+            label="Max Characters",
+            info="0 = no limit (full page), default 3000"
+        ),
+        gr.Textbox(
+            label="Strip Selectors",
+            placeholder=".header, .footer, nav, .sidebar",
+            value="",
+            info="CSS selectors to remove (comma-separated)"
+        ),
+        gr.Checkbox(
+            label="URL Scraper",
+            value=False,
+            info="Extract only links instead of content"
         ),
     ],
-    outputs=gr.Markdown(label="Extracted Markdown"),
+    outputs=gr.Markdown(label="Extracted Content"),
     title="Fetch Webpage",
     description=(
-        "<div style=\"text-align:center\">Convert any webpage to clean Markdown format with configurable length, preserving structure and formatting while removing navigation and clutter.</div>"
+        "<div style=\"text-align:center\">Convert any webpage to clean Markdown format with precision controls, or extract all links. Supports custom element removal and length limits.</div>"
     ),
     api_description=(
-        "Fetch a web page and return it converted to Markdown format with configurable length. "
-        "Parameters: url (str - absolute URL), verbosity (str - Brief/Standard/Full controlling output length: Brief=1000 chars, Standard=3000 chars, Full=complete page)."
+        "Fetch a web page and return it converted to Markdown format or extract links with configurable options. "
+        "Parameters: url (str - absolute URL), max_chars (int - 0=no limit, default 3000), "
+        "strip_selectors (str - CSS selectors to remove, comma-separated), "
+        "url_scraper (bool - extract only links instead of content, default False). "
+        "When url_scraper=True, returns formatted list of all links found on the page. "
+        "When False, returns clean Markdown content with custom element removal and length control."
     ),
     flagging_mode="never",
 )
@@ -1010,17 +1137,20 @@ concise_interface = gr.Interface(
     inputs=[
         gr.Textbox(label="Query", placeholder="topic OR site:example.com"),
         gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Max results"),
+        gr.Slider(minimum=1, maximum=10, value=1, step=1, label="Page", info="Page number for pagination"),
     ],
     outputs=gr.Textbox(label="Search Results", interactive=False),
     title="DuckDuckGo Search",
     description=(
-        "<div style=\"text-align:center\">Web search with readable output format. Supports advanced search operators.</div>"
+        "<div style=\"text-align:center\">Web search with readable output format, date detection, and pagination support. Supports advanced search operators.</div>"
     ),
     api_description=(
-        "Run a DuckDuckGo search and return numbered results with URLs, titles, and summaries. "
+        "Run a DuckDuckGo search and return numbered results with URLs, titles, summaries, and publication dates when detectable. "
         "Supports advanced search operators: site: for specific domains, quotes for exact phrases, "
         "OR for alternatives, and - to exclude terms. Examples: 'Python programming', 'site:example.com', "
-        "'\"artificial intelligence\"', 'cats -dogs', 'Python OR JavaScript'."
+        "'\"artificial intelligence\"', 'cats -dogs', 'Python OR JavaScript'. "
+        "Parameters: query (str), max_results (int, 1-20), page (int, 1-based pagination). "
+        "Returns formatted results with date metadata and pagination hints for accessing more results."
     ),
     flagging_mode="never",
     submit_btn="Search",