Synacknetwork committed · verified
Commit 1beeec8 · 1 Parent(s): 2031ad7

Upload app.py


Updated app.py to show a scrollable window with the scan results and the URLs matching the given keywords.
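The scrollable panes come from multi-line, read-only Gradio Textbox outputs; a minimal sketch of that pattern (the label here is illustrative, the actual components are in the diff below):

import gradio as gr

# A Textbox with several lines and interactive=False renders as a scrollable, read-only results pane
results_box = gr.Textbox(label="Scan results", lines=10, interactive=False)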

Files changed (1)
  1. app.py +146 -0
app.py ADDED
@@ -0,0 +1,146 @@
import requests
from bs4 import BeautifulSoup
import time
import random
import logging
import sqlite3
from fake_useragent import UserAgent
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
from concurrent.futures import ThreadPoolExecutor
import gradio as gr
from threading import Lock

# User agent initialization
ua = UserAgent()
google_bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

# Domain access times for rate-limiting
domain_access_times = {}

# Thread-safe visited URLs set
visited = set()
visited_lock = Lock()

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)

# Append a line of output to a text file
def save_to_file(filename, data):
    with open(filename, "a") as file:
        file.write(data + "\n")

# Persist a matched URL and its keywords to SQLite
def save_to_database(url, keywords_matched):
    conn = sqlite3.connect("crawler.db")
    cursor = conn.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS results (url TEXT, keywords TEXT)")
    cursor.execute("INSERT INTO results (url, keywords) VALUES (?, ?)", (url, ",".join(keywords_matched)))
    conn.commit()
    conn.close()

# Get delay for rate-limiting (at most one request per domain every 5 seconds)
def get_delay(domain):
    now = time.time()
    if domain in domain_access_times:
        elapsed = now - domain_access_times[domain]
        delay = max(0, 5 - elapsed)
    else:
        delay = 0
    domain_access_times[domain] = now
    return delay

# Check robots.txt to decide whether the URL may be crawled
def can_crawl(url, user_agent):
    robots_url = "/".join(url.split("/")[:3]) + "/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
    except Exception:
        return True  # Assume crawlable if robots.txt cannot be fetched
    return rp.can_fetch(user_agent, url)

# Recursively crawl: fetch the page, record keyword matches, then follow links
def crawl(url, keywords, depth, found_urls, keywords_found):
    if depth <= 0:
        # Return the results accumulated so far instead of discarding them
        return found_urls, keywords_found

    with visited_lock:
        if url in visited:
            return found_urls, keywords_found
        visited.add(url)

    domain = urlparse(url).netloc
    time.sleep(get_delay(domain))

    if not can_crawl(url, google_bot_ua):
        logging.warning(f"Blocked by robots.txt: {url}")
        return found_urls, keywords_found

    for attempt in range(3):  # Retry up to 3 times
        try:
            user_agent = google_bot_ua if random.random() < 0.2 else ua.random
            headers = {"User-Agent": user_agent, "Referer": "https://www.google.com"}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            break
        except requests.exceptions.RequestException as e:
            if attempt == 2:
                logging.error(f"Failed after retries: {url} - {e}")
                return found_urls, keywords_found
            time.sleep(2 ** attempt)  # Exponential backoff

    save_to_file("found.txt", url)
    logging.info(f"Crawled: {url}")
    found_urls += f"{url}\n"

    soup = BeautifulSoup(response.text, "html.parser")

    # Check for keywords
    text = soup.get_text().lower()
    keywords_matched = [kw for kw in keywords if kw.lower() in text]
    if keywords_matched:
        save_to_file("keywords_found.txt", url)
        save_to_database(url, keywords_matched)
        logging.info(f"Keywords found in {url}: {keywords_matched}")
        keywords_found += f"{url} - Keywords: {', '.join(keywords_matched)}\n"

    # Find and crawl links
    for link in soup.find_all("a", href=True):
        next_url = urljoin(url, link["href"])
        if next_url.startswith("http"):
            found_urls, keywords_found = crawl(next_url, keywords, depth - 1, found_urls, keywords_found)

    return found_urls, keywords_found

# Gradio interface function
def gradio_crawl(start_url, keywords, depth):
    # Split the comma-separated keywords and drop surrounding whitespace/empty entries
    keywords_list = [kw.strip() for kw in keywords.split(",") if kw.strip()]
    found_urls = ""
    keywords_found = ""
    found_urls, keywords_found = crawl(start_url, keywords_list, int(depth), found_urls, keywords_found)
    return found_urls, keywords_found

# Gradio UI setup
iface = gr.Interface(
    fn=gradio_crawl,
    inputs=[
        gr.Textbox(label="Starting URL", placeholder="Enter the starting URL"),
        gr.Textbox(label="Keywords (comma-separated)", placeholder="Enter keywords to search for"),
        gr.Slider(label="Crawl Depth", minimum=1, maximum=5, step=1, value=3)
    ],
    outputs=[
        gr.Textbox(label="Crawled URLs", lines=10, placeholder="Found URLs will be shown here...", interactive=False),
        gr.Textbox(label="Keywords Found", lines=10, placeholder="URLs with matching keywords will be shown here...", interactive=False)
    ],
    live=True,
    title="Web Crawler",
    description="A simple web crawler that searches for keywords in websites."
)

# Launch the Gradio app
iface.launch()
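
The crawl entry point can also be exercised without the UI. A minimal sketch, assuming the dependencies implied by the imports above (requests, beautifulsoup4, fake-useragent, gradio) are installed, and using a placeholder URL and keyword list:

# Headless call into the same function the Gradio UI wraps
urls, matches = gradio_crawl("https://example.com", "privacy, security", 2)
print(urls)     # newline-separated list of crawled URLs
print(matches)  # URLs annotated with the keywords they matched

Note that live=True re-runs the crawl whenever an input changes, which can be slow for deeper crawls; the default Submit-button flow (live=False) may be preferable for larger depths.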