Synacknetwork committed on
Commit 0f7bfbc · verified · 1 Parent(s): 364863c

Upload app.py


Web crawler that saves any page containing your keywords; it goes up to 5 levels deep and uses 10 threads.

I made this mainly to search for Bigfoot and UFO/UAP videos for my YouTube channel, and it works well. This is my first time using the Gradio UI with Python, so I hope it holds up; if not, run it locally after removing the Gradio parts. Enjoy!
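For anyone skipping the UI, here is a minimal sketch of a local run. It assumes the Gradio block at the bottom of app.py (the gr.Interface(...) setup and iface.launch()) has been removed so importing the module doesn't start the server; the URL and keyword list below are placeholders, and depth 3 just mirrors the slider default.

# local_run.py -- hypothetical driver script, not part of this commit
from app import crawl

if __name__ == "__main__":
    # Placeholder start URL and keywords; adjust to taste.
    report = crawl("https://example.com", ["bigfoot", "ufo", "uap"], 3)
    print(report)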

Files changed (1)
  1. app.py +139 -0
app.py ADDED
@@ -0,0 +1,139 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import time
+ import random
+ import logging
+ import sqlite3
+ from fake_useragent import UserAgent
+ from urllib.parse import urljoin, urlparse
+ from urllib.robotparser import RobotFileParser
+ from concurrent.futures import ThreadPoolExecutor
+ import gradio as gr
+ from threading import Lock
+
+ # User agent initialization
+ ua = UserAgent()
+ google_bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
+
+ # Domain access times for rate-limiting
+ domain_access_times = {}
+
+ # Thread-safe visited URLs set
+ visited = set()
+ visited_lock = Lock()
+
+ # Setup logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(message)s"
+ )
+
+ # Save to file
+ def save_to_file(filename, data):
+     with open(filename, "a") as file:
+         file.write(data + "\n")
+
+ # Save to database
+ def save_to_database(url, keywords_matched):
+     conn = sqlite3.connect("crawler.db")
+     cursor = conn.cursor()
+     cursor.execute("CREATE TABLE IF NOT EXISTS results (url TEXT, keywords TEXT)")
+     cursor.execute("INSERT INTO results (url, keywords) VALUES (?, ?)", (url, ",".join(keywords_matched)))
+     conn.commit()
+     conn.close()
+
+ # Get delay for rate-limiting
+ def get_delay(domain):
+     now = time.time()
+     if domain in domain_access_times:
+         elapsed = now - domain_access_times[domain]
+         delay = max(0, 5 - elapsed)
+     else:
+         delay = 0
+     domain_access_times[domain] = now
+     return delay
+
+ # Can crawl based on robots.txt
+ def can_crawl(url, user_agent):
+     parsed_url = "/".join(url.split("/")[:3]) + "/robots.txt"
+     rp = RobotFileParser()
+     rp.set_url(parsed_url)
+     try:
+         rp.read()
+     except Exception:
+         return True  # Assume crawlable if robots.txt cannot be fetched
+     return rp.can_fetch(user_agent, url)
+
+ # Crawl function
+ def crawl(url, keywords, depth):
+     if depth <= 0:
+         return ""
+
+     with visited_lock:
+         if url in visited:
+             return ""
+         visited.add(url)
+
+     domain = urlparse(url).netloc
+     time.sleep(get_delay(domain))
+
+     if not can_crawl(url, google_bot_ua):
+         logging.warning(f"Blocked by robots.txt: {url}")
+         return ""
+
+     for attempt in range(3):  # Retry up to 3 times
+         try:
+             user_agent = google_bot_ua if random.random() < 0.2 else ua.random
+             headers = {"User-Agent": user_agent, "Referer": "https://www.google.com"}
+             response = requests.get(url, headers=headers, timeout=10)
+             response.raise_for_status()
+             break
+         except requests.exceptions.RequestException as e:
+             if attempt == 2:
+                 logging.error(f"Failed after retries: {url} - {e}")
+                 return ""
+             time.sleep(2 ** attempt)  # Exponential backoff
+
+     save_to_file("found.txt", url)
+     logging.info(f"Crawled: {url}")
+
+     soup = BeautifulSoup(response.text, "html.parser")
+
+     # Check for keywords
+     text = soup.get_text().lower()
+     keywords_matched = [kw for kw in keywords if kw.lower() in text]
+     if keywords_matched:
+         save_to_file("keywords_found.txt", url)
+         save_to_database(url, keywords_matched)
+         logging.info(f"Keywords found in {url}: {keywords_matched}")
+
+     # Find and crawl links
+     for link in soup.find_all("a", href=True):
+         next_url = urljoin(url, link["href"])
+         if next_url.startswith("http"):
+             crawl(next_url, keywords, depth - 1)
+
+     return f"Crawled: {url}, Keywords: {keywords_matched}"
+
+ # Gradio interface function
+ def gradio_crawl(start_url, keywords, depth):
+     keywords_list = [kw.strip() for kw in keywords.split(",")]  # Strip spaces so "bigfoot, ufo" matches
+     result = crawl(start_url, keywords_list, depth)
+     return result
+
+ # Gradio UI setup
+ iface = gr.Interface(
+     fn=gradio_crawl,
+     inputs=[
+         gr.Textbox(label="Starting URL", placeholder="Enter the starting URL"),
+         gr.Textbox(label="Keywords (comma-separated)", placeholder="Enter keywords to search for"),
+         gr.Slider(label="Crawl Depth", minimum=1, maximum=5, step=1, value=3)
+     ],
+     outputs="text",
+     live=True,
+     title="Web Crawler",
+     description="A simple web crawler that searches for keywords in websites."
+ )
+
+ # Launch the Gradio app
+ iface.launch()
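A note on the "10 threads" in the description: crawl() above recurses in a single thread, and the ThreadPoolExecutor import is never used. One possible way to wire it in, sketched here under the assumption that crawl() and the other globals from app.py are in scope (crawl_threaded and the workers parameter are illustrative names, not part of the commit), is to fetch the start page once and fan its links out to a worker pool.

# Hypothetical sketch, not part of the committed code.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor

def crawl_threaded(start_url, keywords, depth, workers=10):
    # Harvest the starting page's links, then crawl each one in its own worker.
    # The pool size of 10 matches the commit description, not the committed code.
    response = requests.get(start_url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    links = {urljoin(start_url, a["href"]) for a in soup.find_all("a", href=True)}
    links = {u for u in links if u.startswith("http")}
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # crawl() keeps its own rate limiting, robots.txt check, and retries.
        results = pool.map(lambda u: crawl(u, keywords, depth - 1), links)
    return "\n".join(r for r in results if r)

Because the shared visited set in app.py is guarded by visited_lock, workers in this sketch would not re-crawl pages another worker has already claimed.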