import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor

class WebScraper:
    def __init__(self):
        pass

    def get_links(self, url: str, timeout: float = 4) -> list:
        """Collect same-site links from `url` and from each page it links
        to, stopping once `timeout` seconds have elapsed."""
        start = time.time()

        def get_links_from_page(page_url: str) -> list:
            # Bound each request so a slow server cannot hang the crawl.
            response = requests.get(page_url, timeout=timeout)
            soup = BeautifulSoup(response.content, "lxml")
            links = []
            for anchor in soup.find_all("a"):
                href = anchor.attrs.get("href")
                if href is None:
                    continue
                if urlparse(href).netloc == urlparse(page_url).netloc:
                    # Absolute link on the same host: keep it as-is.
                    links.append(href)
                elif not href.startswith(("//", "file", "javascript", "tel", "mailto", "http")):
                    # Relative link: resolve it against the current page.
                    links.append(urljoin(page_url + "/", href))
            # Drop fragment links and duplicates once, after the loop,
            # rather than rebuilding the list on every iteration.
            return list({link for link in links if "#" not in link})

        unique_links = set()
        for link in get_links_from_page(url):
            # Stop following second-level pages once the time budget is spent.
            if time.time() - start > timeout:
                break
            unique_links |= set(get_links_from_page(link))
        # Strip a trailing slash so equivalent URLs dedupe together.
        return list({x[:-1] if x.endswith("/") else x for x in unique_links})
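

# The ThreadPoolExecutor import above is otherwise unused. The helper below
# is an illustrative sketch of how the second-level fetches could be done in
# parallel; the name `fetch_many` and `max_workers=8` are assumptions, not
# part of the original scraper.
def fetch_many(urls: list, timeout: float = 4) -> dict:
    """Fetch several pages concurrently, mapping each URL to its body
    (or None if the request failed)."""
    def fetch(u):
        try:
            return u, requests.get(u, timeout=timeout).content
        except requests.RequestException:
            return u, None

    with ThreadPoolExecutor(max_workers=8) as pool:
        return dict(pool.map(fetch, urls))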
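

# Minimal usage sketch; "https://example.com" is a placeholder URL, not one
# taken from this file.
if __name__ == "__main__":
    scraper = WebScraper()
    for link in sorted(scraper.get_links("https://example.com", timeout=4)):
        print(link)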