ShubhamMhaske committed on
Commit 0bca85a · verified
1 Parent(s): eeffdba

Create app.py

Files changed (1)
  1. app.py +110 -0
app.py ADDED
@@ -0,0 +1,110 @@
+ import streamlit as st
+ import requests
+ import threading
+ import queue
+ import time
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin, urlparse
+ from concurrent.futures import ThreadPoolExecutor
+
+ # Streamlit UI
+ st.title("Web Crawler - Link Checker")
+
+ BASE_URL = st.text_input("Enter the Base URL:", "https://www.example.com/")
+ MAX_CONCURRENT_THREAD = st.slider("Select Max Concurrent Threads", 1, 20, 10)
+ LOGGING = st.checkbox("Enable Logging", True)
+
+ start_crawling = st.button("Start Crawling")
+
+ # Global Variables
+ relativeLinks = queue.Queue()
+ visitedLinks = set()
+
+ # File Output Data
+ visited_links = []
+ ok_links = []
+ error_links = []
+ exception_links = []
+
+ session = requests.Session()  # Reuse session for connection pooling
+ session.headers.update({"User-Agent": "Mozilla/5.0"})  # Set a common User-Agent
+
+ def link_checker(source, address):
+     """Check the validity of a link."""
+     try:
+         resp = session.get(address, timeout=5)
+         if resp.status_code in [400, 404, 403, 408, 409, 501, 502, 503]:
+             error_links.append(f"[{source}] {resp.status_code} {resp.reason} --> {address}")
+         else:
+             ok_links.append(f"[{source}] OK --> {address}")
+     except requests.RequestException as e:
+         exception_links.append(f"[{source}] Exception --> {e} {address}")
+
+ def normalize_url(a):
+     """Normalize URLs to absolute format."""
+     if a.startswith("#") or a.startswith("javascript:") or a.startswith("mailto:") or a.startswith("tel:"):
+         return None
+     return urljoin(BASE_URL, a).rstrip('/')
+
+ def link_extractor(address):
+     """Extract links from a webpage and check their validity."""
+     visited_links.append(address)
+     try:
+         res = session.get(address, timeout=5)
+         soup = BeautifulSoup(res.text, 'html.parser')
+
+         extracted_links = []
+         for tag, attr in [('img', 'src'), ('link', 'href'), ('script', 'src'), ('a', 'href')]:
+             extracted_links.extend(
+                 normalize_url(link[attr]) for link in soup.find_all(tag) if link.has_attr(attr)
+             )
+
+         extracted_links = set(filter(None, extracted_links))  # Remove duplicates and None values
+
+         with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_THREAD) as executor:
+             executor.map(lambda p: link_checker(source=address, address=p), extracted_links)
+
+         for p in extracted_links:
+             if p.startswith(BASE_URL) and p not in visitedLinks:
+                 visitedLinks.add(p)
+                 relativeLinks.put(p)
+     except requests.RequestException as e:
+         exception_links.append(f"[RELATIVE LINK] Exception --> {e} {address}")
+
+ def threader():
+     """Worker function for handling queued links."""
+     while not relativeLinks.empty():
+         value = relativeLinks.get()
+         if value:
+             print(f"Checking: {value} | Remaining: {relativeLinks.qsize()}")
+             link_extractor(value)
+         relativeLinks.task_done()
+
+ if start_crawling:
+     start_time = time.time()
+     visitedLinks.add(BASE_URL.strip())
+     relativeLinks.put(BASE_URL.strip())
+
+     with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_THREAD) as executor:
+         executor.map(lambda _: threader(), range(MAX_CONCURRENT_THREAD))
+
+     end_time = time.time()
+     elapsed_time = end_time - start_time
+
+     # Display Results
+     st.success("Crawling Completed!")
+     st.write(f"Total Time Taken: {elapsed_time:.2f} seconds")
+     st.write(f"Total Links Visited: {len(visited_links)}")
+     st.write(f"Total OK Links: {len(ok_links)}")
+     st.write(f"Total Error Links: {len(error_links)}")
+     st.write(f"Total Exception Links: {len(exception_links)}")
+
+     # Display Logs
+     with st.expander("Visited Links"):
+         st.write(visited_links)
+     with st.expander("OK Links"):
+         st.write(ok_links)
+     with st.expander("Error Links"):
+         st.write(error_links)
+     with st.expander("Exception Links"):
+         st.write(exception_links)
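
Usage note (not part of the commit): a minimal sketch for trying the app locally, assuming streamlit, requests, and beautifulsoup4 are the only third-party dependencies the file needs.

    pip install streamlit requests beautifulsoup4
    streamlit run app.py

Streamlit then serves the UI in the browser; entering a base URL and pressing "Start Crawling" runs the crawl and shows the OK/error/exception link counts in the expanders.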