Synacknetwork committed · verified
Commit 1beeec8 · 1 Parent(s): 2031ad7

Upload app.py


Updated app.py to show a scrollable window with the scan results and the URLs matching the given keywords.
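The scrollable panes come from multi-line, read-only Gradio Textbox outputs; a minimal sketch of that pattern (the label here is illustrative, the actual components are in the diff below):

import gradio as gr

# A Textbox with several lines and interactive=False renders as a scrollable, read-only results pane
results_box = gr.Textbox(label="Scan results", lines=10, interactive=False)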

Files changed (1)
  1. app.py +146 -0
app.py ADDED
@@ -0,0 +1,146 @@
import requests
from bs4 import BeautifulSoup
import time
import random
import logging
import sqlite3
from fake_useragent import UserAgent
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
from concurrent.futures import ThreadPoolExecutor
import gradio as gr
from threading import Lock

# User agent initialization
ua = UserAgent()
google_bot_ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

# Domain access times for rate-limiting
domain_access_times = {}

# Thread-safe visited URLs set
visited = set()
visited_lock = Lock()

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)

# Append a line of output to a text file
def save_to_file(filename, data):
    with open(filename, "a") as file:
        file.write(data + "\n")

# Persist a matched URL and its keywords to SQLite
def save_to_database(url, keywords_matched):
    conn = sqlite3.connect("crawler.db")
    cursor = conn.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS results (url TEXT, keywords TEXT)")
    cursor.execute("INSERT INTO results (url, keywords) VALUES (?, ?)", (url, ",".join(keywords_matched)))
    conn.commit()
    conn.close()

# Get delay for rate-limiting (at most one request per domain every 5 seconds)
def get_delay(domain):
    now = time.time()
    if domain in domain_access_times:
        elapsed = now - domain_access_times[domain]
        delay = max(0, 5 - elapsed)
    else:
        delay = 0
    domain_access_times[domain] = now
    return delay

# Check robots.txt to decide whether the URL may be crawled
def can_crawl(url, user_agent):
    robots_url = "/".join(url.split("/")[:3]) + "/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
    except Exception:
        return True  # Assume crawlable if robots.txt cannot be fetched
    return rp.can_fetch(user_agent, url)

# Recursively crawl: fetch the page, record keyword matches, then follow links
def crawl(url, keywords, depth, found_urls, keywords_found):
    if depth <= 0:
        # Return the results accumulated so far instead of discarding them
        return found_urls, keywords_found

    with visited_lock:
        if url in visited:
            return found_urls, keywords_found
        visited.add(url)

    domain = urlparse(url).netloc
    time.sleep(get_delay(domain))

    if not can_crawl(url, google_bot_ua):
        logging.warning(f"Blocked by robots.txt: {url}")
        return found_urls, keywords_found

    for attempt in range(3):  # Retry up to 3 times
        try:
            user_agent = google_bot_ua if random.random() < 0.2 else ua.random
            headers = {"User-Agent": user_agent, "Referer": "https://www.google.com"}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            break
        except requests.exceptions.RequestException as e:
            if attempt == 2:
                logging.error(f"Failed after retries: {url} - {e}")
                return found_urls, keywords_found
            time.sleep(2 ** attempt)  # Exponential backoff

    save_to_file("found.txt", url)
    logging.info(f"Crawled: {url}")
    found_urls += f"{url}\n"

    soup = BeautifulSoup(response.text, "html.parser")

    # Check for keywords
    text = soup.get_text().lower()
    keywords_matched = [kw for kw in keywords if kw.lower() in text]
    if keywords_matched:
        save_to_file("keywords_found.txt", url)
        save_to_database(url, keywords_matched)
        logging.info(f"Keywords found in {url}: {keywords_matched}")
        keywords_found += f"{url} - Keywords: {', '.join(keywords_matched)}\n"

    # Find and crawl links
    for link in soup.find_all("a", href=True):
        next_url = urljoin(url, link["href"])
        if next_url.startswith("http"):
            found_urls, keywords_found = crawl(next_url, keywords, depth - 1, found_urls, keywords_found)

    return found_urls, keywords_found

# Gradio interface function
def gradio_crawl(start_url, keywords, depth):
    # Split the comma-separated keywords and drop surrounding whitespace/empty entries
    keywords_list = [kw.strip() for kw in keywords.split(",") if kw.strip()]
    found_urls = ""
    keywords_found = ""
    found_urls, keywords_found = crawl(start_url, keywords_list, int(depth), found_urls, keywords_found)
    return found_urls, keywords_found

# Gradio UI setup
iface = gr.Interface(
    fn=gradio_crawl,
    inputs=[
        gr.Textbox(label="Starting URL", placeholder="Enter the starting URL"),
        gr.Textbox(label="Keywords (comma-separated)", placeholder="Enter keywords to search for"),
        gr.Slider(label="Crawl Depth", minimum=1, maximum=5, step=1, value=3)
    ],
    outputs=[
        gr.Textbox(label="Crawled URLs", lines=10, placeholder="Found URLs will be shown here...", interactive=False),
        gr.Textbox(label="Keywords Found", lines=10, placeholder="URLs with matching keywords will be shown here...", interactive=False)
    ],
    live=True,
    title="Web Crawler",
    description="A simple web crawler that searches for keywords in websites."
)

# Launch the Gradio app
iface.launch()
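
The crawl entry point can also be exercised without the UI. A minimal sketch, assuming the dependencies implied by the imports above (requests, beautifulsoup4, fake-useragent, gradio) are installed, and using a placeholder URL and keyword list:

# Headless call into the same function the Gradio UI wraps
urls, matches = gradio_crawl("https://example.com", "privacy, security", 2)
print(urls)     # newline-separated list of crawled URLs
print(matches)  # URLs annotated with the keywords they matched

Note that live=True re-runs the crawl whenever an input changes, which can be slow for deeper crawls; the default Submit-button flow (live=False) may be preferable for larger depths.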