# WebcrawlPLUS / app.py
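"""
WebCrawlPLUS: a small Gradio app that recursively crawls pages starting from a
user-supplied URL, respects robots.txt, rate-limits requests per domain, and
records every page whose text contains any of the given keywords (to
found.txt, keywords_found.txt, and a local SQLite database, crawler.db).
"""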
import requests
from bs4 import BeautifulSoup
import time
import random
import logging
import sqlite3
from fake_useragent import UserAgent
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
from concurrent.futures import ThreadPoolExecutor
import gradio as gr
from threading import Lock
# User agent initialization
ua = UserAgent()
google_bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
# Domain access times for rate-limiting
domain_access_times = {}
# Thread-safe visited URLs set
visited = set()
visited_lock = Lock()
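# `visited` is shared state; the lock keeps additions and lookups consistent if
# crawl() is ever driven from more than one thread (e.g. concurrent Gradio calls).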
# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
# Save to file
def save_to_file(filename, data):
    with open(filename, "a") as file:
        file.write(data + "\n")
# Save to database
def save_to_database(url, keywords_matched):
    conn = sqlite3.connect("crawler.db")
    cursor = conn.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS results (url TEXT, keywords TEXT)")
    cursor.execute("INSERT INTO results (url, keywords) VALUES (?, ?)", (url, ",".join(keywords_matched)))
    conn.commit()
    conn.close()
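# Optional helper (hypothetical; not called anywhere in this app): read back
# everything stored in crawler.db by save_to_database().
def load_results():
    conn = sqlite3.connect("crawler.db")
    cursor = conn.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS results (url TEXT, keywords TEXT)")
    rows = cursor.execute("SELECT url, keywords FROM results").fetchall()
    conn.close()
    return rows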
# Per-domain rate limiting: allow at most one request to a domain every 5 seconds
def get_delay(domain):
    now = time.time()
    if domain in domain_access_times:
        elapsed = now - domain_access_times[domain]
        delay = max(0, 5 - elapsed)
    else:
        delay = 0
    domain_access_times[domain] = now
    return delay
# Check whether robots.txt allows crawling this URL
def can_crawl(url, user_agent):
    robots_url = "/".join(url.split("/")[:3]) + "/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
    except Exception:
        return True  # Assume crawlable if robots.txt cannot be fetched
    return rp.can_fetch(user_agent, url)
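# robots.txt is re-fetched for every page above; a per-domain cache
# (hypothetical sketch, not wired into crawl()) would avoid repeated fetches:
robots_parsers = {}

def can_crawl_cached(url, user_agent):
    base = "{0.scheme}://{0.netloc}".format(urlparse(url))
    if base not in robots_parsers:
        rp = RobotFileParser()
        rp.set_url(base + "/robots.txt")
        try:
            rp.read()
            robots_parsers[base] = rp
        except Exception:
            robots_parsers[base] = None  # treat as crawlable if unavailable
    cached = robots_parsers[base]
    return True if cached is None else cached.can_fetch(user_agent, url)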
# Recursively crawl a URL, accumulating crawled URLs and keyword matches
def crawl(url, keywords, depth, found_urls, keywords_found):
    if depth <= 0:
        return found_urls, keywords_found
    with visited_lock:
        if url in visited:
            return found_urls, keywords_found
        visited.add(url)
    domain = urlparse(url).netloc
    time.sleep(get_delay(domain))
    if not can_crawl(url, google_bot_ua):
        logging.warning(f"Blocked by robots.txt: {url}")
        return found_urls, keywords_found
    for attempt in range(3):  # Retry up to 3 times
        try:
            # Occasionally present as Googlebot, otherwise use a random user agent
            user_agent = google_bot_ua if random.random() < 0.2 else ua.random
            headers = {"User-Agent": user_agent, "Referer": "https://www.google.com"}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            break
        except requests.exceptions.RequestException as e:
            if attempt == 2:
                logging.error(f"Failed after retries: {url} - {e}")
                return found_urls, keywords_found
            time.sleep(2 ** attempt)  # Exponential backoff
    save_to_file("found.txt", url)
    logging.info(f"Crawled: {url}")
    found_urls += f"{url}\n"
    soup = BeautifulSoup(response.text, "html.parser")
    # Check for keywords
    text = soup.get_text().lower()
    keywords_matched = [kw for kw in keywords if kw.lower() in text]
    if keywords_matched:
        save_to_file("keywords_found.txt", url)
        save_to_database(url, keywords_matched)
        logging.info(f"Keywords found in {url}: {keywords_matched}")
        keywords_found += f"{url} - Keywords: {', '.join(keywords_matched)}\n"
    # Find and crawl links
    for link in soup.find_all("a", href=True):
        next_url = urljoin(url, link["href"])
        if next_url.startswith("http"):
            found_urls, keywords_found = crawl(next_url, keywords, depth - 1, found_urls, keywords_found)
    return found_urls, keywords_found
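# ThreadPoolExecutor is imported above but never used; a minimal sketch of how
# several start URLs could be crawled in parallel (hypothetical helper, not
# called anywhere in this app; note domain_access_times is not lock-protected):
def crawl_many(start_urls, keywords, depth, max_workers=4):
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(crawl, u, keywords, depth, "", "") for u in start_urls]
        for future in futures:
            results.append(future.result())
    return results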
# Gradio interface function
def gradio_crawl(start_url, keywords, depth):
    keywords_list = [kw.strip() for kw in keywords.split(",") if kw.strip()]
    found_urls = ""
    keywords_found = ""
    # Note: `visited` is global, so URLs crawled in earlier runs are skipped.
    found_urls, keywords_found = crawl(start_url, keywords_list, int(depth), found_urls, keywords_found)
    return found_urls, keywords_found
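# Example call (hypothetical values): gradio_crawl("https://example.com", "python, security", 2)
# returns two newline-separated strings: crawled URLs, and URLs with keyword hits.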
# Gradio UI setup
iface = gr.Interface(
    fn=gradio_crawl,
    inputs=[
        gr.Textbox(label="Starting URL", placeholder="Enter the starting URL"),
        gr.Textbox(label="Keywords (comma-separated)", placeholder="Enter keywords to search for"),
        gr.Slider(label="Crawl Depth", minimum=1, maximum=5, step=1, value=3)
    ],
    outputs=[
        gr.Textbox(label="Crawled URLs", lines=10, placeholder="Found URLs will be shown here...", interactive=False),
        gr.Textbox(label="Keywords Found", lines=10, placeholder="URLs with matching keywords will be shown here...", interactive=False)
    ],
    live=True,
    title="WebCrawlPLUS",
    description="A webcrawler from SynAckNetwork.com"
)
# Launch the Gradio app
iface.launch()
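# On Hugging Face Spaces the bare launch() above is enough; when running
# locally, iface.launch(share=True) would also expose a temporary public URL.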