import requests
from bs4 import BeautifulSoup
import time
import random
import logging
import sqlite3
from fake_useragent import UserAgent
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import gradio as gr
from threading import Lock
# User agent initialization
ua = UserAgent()
google_bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
# Domain access times for rate-limiting
domain_access_times = {}
# Thread-safe visited URLs set
visited = set()
visited_lock = Lock()
# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
# Save to file
def save_to_file(filename, data):
    with open(filename, "a") as file:
        file.write(data + "\n")
# Save to database
def save_to_database(url, keywords_matched):
    conn = sqlite3.connect("crawler.db")
    cursor = conn.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS results (url TEXT, keywords TEXT)")
    cursor.execute("INSERT INTO results (url, keywords) VALUES (?, ?)", (url, ",".join(keywords_matched)))
    conn.commit()
    conn.close()
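# Illustrative sketch (not part of the crawler's flow): how stored matches could be
# read back. Table and column names follow the schema created above; the helper
# name load_results is only an example, and it assumes the table already exists.
def load_results(db_path="crawler.db"):
    conn = sqlite3.connect(db_path)
    try:
        # Each row is (url, comma-separated keywords) as written by save_to_database.
        return conn.execute("SELECT url, keywords FROM results").fetchall()
    finally:
        conn.close()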
# Get delay for rate-limiting
def get_delay(domain):
    now = time.time()
    if domain in domain_access_times:
        elapsed = now - domain_access_times[domain]
        delay = max(0, 5 - elapsed)
    else:
        delay = 0
    domain_access_times[domain] = now
    return delay
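# Worked example: if example.com (illustrative) was last requested 2 seconds ago,
# elapsed == 2 and the caller sleeps max(0, 5 - 2) == 3 seconds, giving roughly
# one request per domain every 5 seconds.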
# Can crawl based on robots.txt
def can_crawl(url, user_agent):
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
    except Exception:
        return True  # Assume crawlable if robots.txt cannot be fetched
    return rp.can_fetch(user_agent, url)
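# Example: for url "https://example.com/a/b" (illustrative), robots_url becomes
# "https://example.com/robots.txt".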
# Crawl function
def crawl(url, keywords, depth):
    if depth <= 0:
        return ""
    with visited_lock:
        if url in visited:
            return ""
        visited.add(url)
    domain = urlparse(url).netloc
    time.sleep(get_delay(domain))
    if not can_crawl(url, google_bot_ua):
        logging.warning(f"Blocked by robots.txt: {url}")
        return ""
    for attempt in range(3):  # Retry up to 3 times
        try:
            user_agent = google_bot_ua if random.random() < 0.2 else ua.random
            headers = {"User-Agent": user_agent, "Referer": "https://www.google.com"}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            break
        except requests.exceptions.RequestException as e:
            if attempt == 2:
                logging.error(f"Failed after retries: {url} - {e}")
                return ""
            time.sleep(2 ** attempt)  # Exponential backoff
    save_to_file("found.txt", url)
    logging.info(f"Crawled: {url}")
    soup = BeautifulSoup(response.text, "html.parser")
    # Check for keywords
    text = soup.get_text().lower()
    keywords_matched = [kw for kw in keywords if kw.lower() in text]
    if keywords_matched:
        save_to_file("keywords_found.txt", url)
        save_to_database(url, keywords_matched)
        logging.info(f"Keywords found in {url}: {keywords_matched}")
    # Find and crawl links
    for link in soup.find_all("a", href=True):
        next_url = urljoin(url, link["href"])
        if next_url.startswith("http"):
            crawl(next_url, keywords, depth - 1)
    return f"Crawled: {url}, Keywords: {keywords_matched}"
# Gradio interface function
def gradio_crawl(start_url, keywords, depth):
    # Strip surrounding whitespace so input like "python, crawler" matches both
    # keywords, and coerce the slider value to an int for the depth counter.
    keywords_list = [kw.strip() for kw in keywords.split(",") if kw.strip()]
    result = crawl(start_url, keywords_list, int(depth))
    return result
# Gradio UI setup
iface = gr.Interface(
    fn=gradio_crawl,
    inputs=[
        gr.Textbox(label="Starting URL", placeholder="Enter the starting URL"),
        gr.Textbox(label="Keywords (comma-separated)", placeholder="Enter keywords to search for"),
        gr.Slider(label="Crawl Depth", minimum=1, maximum=5, step=1, value=3)
    ],
    outputs="text",
    live=True,
    title="Web Crawler",
    description="A simple web crawler that searches for keywords in websites."
)
# Launch the Gradio app
iface.launch()
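# Note: iface.launch(share=True) would additionally create a temporary public
# link; as written, the interface is only served locally.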