Update app.py
Browse files
app.py
CHANGED
|
@@ -1,9 +1,8 @@
|
|
| 1 |
# File: main/app.py
|
| 2 |
-
# Purpose: One Space that offers four tools/tabs:
|
| 3 |
# 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
|
| 4 |
# 2) DuckDuckGo Search — compact JSONL search output (short keys to minimize tokens)
|
| 5 |
-
# 3) Generate Sitemap — grouped Markdown list of a page's anchor links (internal/external)
|
| 6 |
-
# 4) Python Code Executor — run Python code and capture stdout/errors
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
|
@@ -379,129 +378,6 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
|
|
| 379 |
return "\n".join(lines)
|
| 380 |
|
| 381 |
|
| 382 |
-
# ============================================
|
| 383 |
-
# Generate Sitemap (new MCP tool #5)
|
| 384 |
-
# ============================================
|
| 385 |
-
|
| 386 |
-
def Generate_Sitemap(
|
| 387 |
-
url: str,
|
| 388 |
-
max_links_per_domain: int = 0,
|
| 389 |
-
) -> str:
|
| 390 |
-
"""
|
| 391 |
-
Generate a grouped sitemap (Markdown) of anchor links on a page, with an optional
|
| 392 |
-
per-domain cap.
|
| 393 |
-
|
| 394 |
-
Args:
|
| 395 |
-
url (str): The starting page URL (http/https). If the scheme is omitted,
|
| 396 |
-
https is assumed.
|
| 397 |
-
max_links_per_domain (int): Limit the number of links shown per domain.
|
| 398 |
-
Use 0 to show all links.
|
| 399 |
-
|
| 400 |
-
Returns:
|
| 401 |
-
str: Markdown text containing grouped links under "Internal Links" and
|
| 402 |
-
per-domain "External Links (domain)" sections. If an error occurs or no
|
| 403 |
-
links are found, a short message is returned.
|
| 404 |
-
"""
|
| 405 |
-
# --- Basic validation & normalization ---
|
| 406 |
-
if not url or not url.strip():
|
| 407 |
-
return "Please enter a valid URL."
|
| 408 |
-
|
| 409 |
-
# If the user forgot the scheme, assume https
|
| 410 |
-
if not url.lower().startswith(("http://", "https://")):
|
| 411 |
-
url = "https://" + url.strip()
|
| 412 |
-
|
| 413 |
-
# --- Fetch the page safely ---
|
| 414 |
-
try:
|
| 415 |
-
resp = _http_get(url)
|
| 416 |
-
resp.raise_for_status()
|
| 417 |
-
except requests.exceptions.RequestException as e:
|
| 418 |
-
return f"Error fetching URL: {str(e)}"
|
| 419 |
-
|
| 420 |
-
base_url = str(resp.url) # follow redirects and use the final URL
|
| 421 |
-
content_type = resp.headers.get("Content-Type", "")
|
| 422 |
-
if "html" not in content_type.lower():
|
| 423 |
-
return "The provided URL does not appear to be an HTML page."
|
| 424 |
-
|
| 425 |
-
# --- Parse and collect links ---
|
| 426 |
-
soup = BeautifulSoup(resp.content, "lxml") # fast, lenient HTML parsing
|
| 427 |
-
anchors = soup.find_all("a", href=True)
|
| 428 |
-
|
| 429 |
-
seen_urls: set[str] = set()
|
| 430 |
-
items: List[Dict[str, str]] = []
|
| 431 |
-
|
| 432 |
-
for a in anchors:
|
| 433 |
-
href = (a.get("href") or "").strip()
|
| 434 |
-
if not href:
|
| 435 |
-
continue
|
| 436 |
-
|
| 437 |
-
# Skip non-navigational/unsupported schemes
|
| 438 |
-
if href.startswith(("#", "javascript:", "mailto:", "tel:")):
|
| 439 |
-
continue
|
| 440 |
-
|
| 441 |
-
# Resolve relative links and strip fragments
|
| 442 |
-
absolute = urljoin(base_url, href)
|
| 443 |
-
absolute, _ = urldefrag(absolute)
|
| 444 |
-
|
| 445 |
-
# Deduplicate and skip self
|
| 446 |
-
if absolute in seen_urls or absolute == base_url:
|
| 447 |
-
continue
|
| 448 |
-
seen_urls.add(absolute)
|
| 449 |
-
|
| 450 |
-
# Use link text if available; otherwise the URL itself
|
| 451 |
-
text = (a.get_text(" ", strip=True) or href).strip()
|
| 452 |
-
if len(text) > 100:
|
| 453 |
-
text = text[:100] + "..."
|
| 454 |
-
|
| 455 |
-
items.append({"text": text, "url": absolute})
|
| 456 |
-
|
| 457 |
-
if not items:
|
| 458 |
-
return "No links found on this page."
|
| 459 |
-
|
| 460 |
-
# --- Group by Internal vs External domains ---
|
| 461 |
-
base_netloc = urlparse(base_url).netloc
|
| 462 |
-
domain_groups: Dict[str, List[Dict[str, str]]] = {}
|
| 463 |
-
|
| 464 |
-
for it in items:
|
| 465 |
-
netloc = urlparse(it["url"]).netloc
|
| 466 |
-
key = "Internal Links" if netloc == base_netloc else f"External Links ({netloc})"
|
| 467 |
-
domain_groups.setdefault(key, []).append(it)
|
| 468 |
-
|
| 469 |
-
# --- Build Markdown with optional per-domain limit ---
|
| 470 |
-
total_links = len(items)
|
| 471 |
-
md_lines: List[str] = []
|
| 472 |
-
md_lines.append("# Sitemap")
|
| 473 |
-
md_lines.append(f"Base URL: {base_url}")
|
| 474 |
-
md_lines.append(f"Found {total_links} links:\n")
|
| 475 |
-
|
| 476 |
-
# Show Internal first, then external groups sorted by name
|
| 477 |
-
keys_sorted = ["Internal Links"] + sorted([k for k in domain_groups if k != "Internal Links"])
|
| 478 |
-
|
| 479 |
-
for group_key in keys_sorted:
|
| 480 |
-
if group_key not in domain_groups:
|
| 481 |
-
continue
|
| 482 |
-
|
| 483 |
-
group_links = domain_groups[group_key]
|
| 484 |
-
md_lines.append(f"## {group_key}\n")
|
| 485 |
-
|
| 486 |
-
if max_links_per_domain and max_links_per_domain > 0:
|
| 487 |
-
links_to_show = group_links[:max_links_per_domain]
|
| 488 |
-
remaining = max(0, len(group_links) - max_links_per_domain)
|
| 489 |
-
else:
|
| 490 |
-
links_to_show = group_links
|
| 491 |
-
remaining = 0
|
| 492 |
-
|
| 493 |
-
for link in links_to_show:
|
| 494 |
-
md_lines.append(f"- [{link['text']}]({link['url']})")
|
| 495 |
-
|
| 496 |
-
if remaining > 0:
|
| 497 |
-
md_lines.append(f"- ... and {remaining} more links")
|
| 498 |
-
|
| 499 |
-
md_lines.append("") # blank line after each group
|
| 500 |
-
|
| 501 |
-
sitemap_md = "\n".join(md_lines).strip()
|
| 502 |
-
return sitemap_md
|
| 503 |
-
|
| 504 |
-
|
| 505 |
# ======================================
|
| 506 |
# Code Execution: Python (MCP tool #6)
|
| 507 |
# ======================================
|
|
@@ -526,7 +402,7 @@ def Execute_Python(code: str) -> str:
|
|
| 526 |
|
| 527 |
|
| 528 |
# ======================
|
| 529 |
-
# UI: four-tab interface
|
| 530 |
# ======================
|
| 531 |
|
| 532 |
# --- Fetch tab (compact controllable extraction) ---
|
|
@@ -578,35 +454,7 @@ concise_interface = gr.Interface(
|
|
| 578 |
submit_btn="Search",
|
| 579 |
)
|
| 580 |
|
| 581 |
-
## Removed Structured and Raw tabs
|
| 582 |
-
|
| 583 |
-
# --- Generate Sitemap tab (LIMITED, grouped + optional per-domain cap) ---
|
| 584 |
-
sitemap_interface = gr.Interface(
|
| 585 |
-
fn=Generate_Sitemap,
|
| 586 |
-
inputs=[
|
| 587 |
-
gr.Textbox(
|
| 588 |
-
label="Website URL",
|
| 589 |
-
placeholder="https://example.com or example.com"
|
| 590 |
-
),
|
| 591 |
-
gr.Slider(
|
| 592 |
-
minimum=0,
|
| 593 |
-
maximum=1000,
|
| 594 |
-
value=0,
|
| 595 |
-
step=1,
|
| 596 |
-
label="Max links per domain (0 = show all)"
|
| 597 |
-
),
|
| 598 |
-
],
|
| 599 |
-
outputs=gr.Markdown(label="Sitemap (Markdown)"),
|
| 600 |
-
title="Generate Sitemap",
|
| 601 |
-
description="Group links by Internal/External domains; optionally limit links per domain.",
|
| 602 |
-
api_description=(
|
| 603 |
-
"Scan a page and build a grouped sitemap of anchor links. Links are grouped as "
|
| 604 |
-
"Internal or External (per domain). Set a per-domain cap; 0 shows all."
|
| 605 |
-
),
|
| 606 |
-
allow_flagging="never",
|
| 607 |
-
theme="Nymbo/Nymbo_Theme",
|
| 608 |
-
submit_btn="Generate",
|
| 609 |
-
)
|
| 610 |
|
| 611 |
# --- Execute Python tab (simple code interpreter) ---
|
| 612 |
code_interface = gr.Interface(
|
|
@@ -621,14 +469,13 @@ code_interface = gr.Interface(
|
|
| 621 |
|
| 622 |
# --- Combine all into a single app with tabs ---
|
| 623 |
demo = gr.TabbedInterface(
|
| 624 |
-
interface_list=[fetch_interface, concise_interface,
|
| 625 |
tab_names=[
|
| 626 |
"Fetch Webpage",
|
| 627 |
"DuckDuckGo Search",
|
| 628 |
-
"Generate Sitemap",
|
| 629 |
"Python Code Executor",
|
| 630 |
],
|
| 631 |
-
title="Web MCP — Fetch, Search,
|
| 632 |
theme="Nymbo/Nymbo_Theme",
|
| 633 |
)
|
| 634 |
|
|
|
|
| 1 |
# File: main/app.py
|
| 2 |
+
# Purpose: One Space that offers three tools/tabs:
|
| 3 |
# 1) Fetch — extract relevant page content (title, metadata, clean text, hyperlinks)
|
| 4 |
# 2) DuckDuckGo Search — compact JSONL search output (short keys to minimize tokens)
|
| 5 |
+
# 3) Python Code Executor — run Python code and capture stdout/errors
|
|
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
|
|
|
| 378 |
return "\n".join(lines)
|
| 379 |
|
| 380 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
# ======================================
|
| 382 |
# Code Execution: Python (MCP tool #3)
|
| 383 |
# ======================================
|
|
|
|
| 402 |
|
| 403 |
|
| 404 |
# ======================
|
| 405 |
+
# UI: three-tab interface
|
| 406 |
# ======================
|
| 407 |
|
| 408 |
# --- Fetch tab (compact controllable extraction) ---
|
|
|
|
| 454 |
submit_btn="Search",
|
| 455 |
)
|
| 456 |
|
| 457 |
+
## Removed Structured, Raw, and Sitemap tabs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 458 |
|
| 459 |
# --- Execute Python tab (simple code interpreter) ---
|
| 460 |
code_interface = gr.Interface(
|
|
|
|
| 469 |
|
| 470 |
# --- Combine all into a single app with tabs ---
|
| 471 |
demo = gr.TabbedInterface(
|
| 472 |
+
interface_list=[fetch_interface, concise_interface, code_interface],
|
| 473 |
tab_names=[
|
| 474 |
"Fetch Webpage",
|
| 475 |
"DuckDuckGo Search",
|
|
|
|
| 476 |
"Python Code Executor",
|
| 477 |
],
|
| 478 |
+
title="Web MCP — Fetch, Search, and Code Execution.",
|
| 479 |
theme="Nymbo/Nymbo_Theme",
|
| 480 |
)
|
| 481 |
|