import time

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin


class WebScraper:
    def get_links(self, url: str, timeout: float = 4) -> list:
        """Collect same-domain links from `url` and from the pages it links
        to, stopping once `timeout` seconds have elapsed."""
        start = time.time()

        def get_links_from_page(page_url: str) -> list:
            # Fetch the page; skip it entirely if the request fails or hangs,
            # so one bad link does not abort the whole crawl.
            try:
                response = requests.get(page_url, timeout=5)
                response.raise_for_status()
            except requests.RequestException:
                return []
            soup = BeautifulSoup(response.content, "lxml")

            links = []
            for anchor in soup.find_all("a"):
                href = anchor.get("href")
                if not href:
                    continue
                if urlparse(href).netloc == urlparse(page_url).netloc:
                    # Absolute link on the same domain: keep it as-is.
                    links.append(href)
                elif not href.startswith(("//", "file", "javascript", "tel", "mailto", "http")):
                    # Relative link: resolve it against the current page.
                    links.append(urljoin(page_url + "/", href))
            # Drop fragment links and deduplicate once, after the loop.
            return list({link for link in links if "#" not in link})

        links = get_links_from_page(url)
        unique_links = set(links)
        for link in links:
            # Stop following discovered links once the time budget is spent.
            if time.time() - start > timeout:
                break
            unique_links.update(get_links_from_page(link))

        # Normalize trailing slashes so "/about" and "/about/" dedupe together.
        return list({link.rstrip("/") for link in unique_links})
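
# A minimal usage sketch. The URL below is illustrative (any reachable site
# would do); the 4-second budget matches the method's default timeout.
if __name__ == "__main__":
    scraper = WebScraper()
    for link in sorted(scraper.get_links("https://example.com", timeout=4)):
        print(link)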