Upload app.py
Web crawler for saving any page that has your keywords on it; it goes up to 5 levels deep and is meant to run on 10 threads (see the threading sketch after the diff).
I made this mainly to search for Bigfoot and UFO/UAP videos for my YouTube channel, and it works well. This is my first time using the Gradio UI with Python, so I hope it runs smoothly; if not, run it locally after removing the Gradio parts. Enjoy!
app.py
ADDED
@@ -0,0 +1,139 @@
import requests
from bs4 import BeautifulSoup
import time
import random
import logging
import sqlite3
from fake_useragent import UserAgent
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
from concurrent.futures import ThreadPoolExecutor
import gradio as gr
from threading import Lock

# User agent initialization
ua = UserAgent()
google_bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

# Domain access times for rate-limiting
domain_access_times = {}

# Thread-safe visited URLs set
visited = set()
visited_lock = Lock()

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)

# Save to file
def save_to_file(filename, data):
    with open(filename, "a") as file:
        file.write(data + "\n")

# Save to database
def save_to_database(url, keywords_matched):
    conn = sqlite3.connect("crawler.db")
    cursor = conn.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS results (url TEXT, keywords TEXT)")
    cursor.execute("INSERT INTO results (url, keywords) VALUES (?, ?)", (url, ",".join(keywords_matched)))
    conn.commit()
    conn.close()

# Get delay for rate-limiting
def get_delay(domain):
    now = time.time()
    if domain in domain_access_times:
        elapsed = now - domain_access_times[domain]
        delay = max(0, 5 - elapsed)
    else:
        delay = 0
    domain_access_times[domain] = now
    return delay

# Can crawl based on robots.txt
def can_crawl(url, user_agent):
    parsed_url = "/".join(url.split("/")[:3]) + "/robots.txt"
    rp = RobotFileParser()
    rp.set_url(parsed_url)
    try:
        rp.read()
    except Exception:
        return True  # Assume crawlable if robots.txt cannot be fetched
    return rp.can_fetch(user_agent, url)

# Crawl function
def crawl(url, keywords, depth):
    if depth <= 0:
        return ""

    with visited_lock:
        if url in visited:
            return ""
        visited.add(url)

    domain = urlparse(url).netloc
    time.sleep(get_delay(domain))

    if not can_crawl(url, google_bot_ua):
        logging.warning(f"Blocked by robots.txt: {url}")
        return ""

    for attempt in range(3):  # Retry up to 3 times
        try:
            user_agent = google_bot_ua if random.random() < 0.2 else ua.random
            headers = {"User-Agent": user_agent, "Referer": "https://www.google.com"}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            break
        except requests.exceptions.RequestException as e:
            if attempt == 2:
                logging.error(f"Failed after retries: {url} - {e}")
                return ""
            time.sleep(2 ** attempt)  # Exponential backoff

    save_to_file("found.txt", url)
    logging.info(f"Crawled: {url}")

    soup = BeautifulSoup(response.text, "html.parser")

    # Check for keywords
    text = soup.get_text().lower()
    keywords_matched = [kw for kw in keywords if kw.lower() in text]
    if keywords_matched:
        save_to_file("keywords_found.txt", url)
        save_to_database(url, keywords_matched)
        logging.info(f"Keywords found in {url}: {keywords_matched}")

    # Find and crawl links
    for link in soup.find_all("a", href=True):
        next_url = urljoin(url, link["href"])
        if next_url.startswith("http"):
            crawl(next_url, keywords, depth - 1)

    return f"Crawled: {url}, Keywords: {keywords_matched}"

# Gradio interface function
def gradio_crawl(start_url, keywords, depth):
    keywords_list = [kw.strip() for kw in keywords.split(",") if kw.strip()]
    result = crawl(start_url, keywords_list, int(depth))
    return result

# Gradio UI setup
iface = gr.Interface(
    fn=gradio_crawl,
    inputs=[
        gr.Textbox(label="Starting URL", placeholder="Enter the starting URL"),
        gr.Textbox(label="Keywords (comma-separated)", placeholder="Enter keywords to search for"),
        gr.Slider(label="Crawl Depth", minimum=1, maximum=5, step=1, value=3)
    ],
    outputs="text",
    live=True,
    title="Web Crawler",
    description="A simple web crawler that searches for keywords in websites."
)

# Launch the Gradio app
iface.launch()
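
Note on threading: the description mentions 10 threads and app.py imports ThreadPoolExecutor, but the executor is never used; crawl() follows links recursively on a single thread. Below is a minimal, hedged sketch of one way the import could be wired up, fanning several starting URLs out to 10 worker threads while reusing the functions from app.py. It also covers the "run it locally after removing the Gradio parts" case, since it calls crawl() directly instead of going through the UI. The file name threaded_crawl.py, the crawl_parallel helper, and the 10-worker default are assumptions of this sketch, not part of the Space, and it assumes iface.launch() in app.py is commented out (or guarded) so that importing app does not start the web server.

threaded_crawl.py (sketch, not part of this Space)

from concurrent.futures import ThreadPoolExecutor

# Assumption: iface.launch() in app.py is commented out or guarded;
# otherwise importing app starts the Gradio server.
from app import crawl

def crawl_parallel(start_urls, keywords, depth, max_workers=10):
    # Fan the starting URLs out to a pool of worker threads; each worker
    # runs the recursive crawl() from app.py, which already protects the
    # shared visited set with a lock.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(crawl, url, keywords, depth) for url in start_urls]
        return [future.result() for future in futures]

if __name__ == "__main__":
    # Local run without the Gradio UI; URLs and keywords are placeholders.
    results = crawl_parallel(
        ["https://example.com", "https://example.org"],
        ["bigfoot", "ufo", "uap"],
        depth=3,
    )
    for line in results:
        print(line)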