import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
|


class WebScraper:
    """Scrape links from a start page and from the pages it links to."""
|

    def get_links(self, url: str, timeout: float = 4):
        """Return links found on the pages that `url` links to.

        `timeout` is the overall time budget for the crawl, in seconds.
        """
        start = time.time()

        def get_links_from_page(url: str) -> list:
            # Skip pages that fail to load instead of crashing the crawl;
            # the overall budget doubles as a per-request cap.
            try:
                response = requests.get(url, timeout=timeout)
            except requests.RequestException:
                return []
            soup = BeautifulSoup(response.content, "lxml")
            links = []
            for anchor in soup.find_all("a"):
                if "href" not in anchor.attrs:
                    continue
                href = anchor.attrs["href"]
                if urlparse(href).netloc == urlparse(url).netloc:
                    # Absolute link on the same host: keep it as-is.
                    links.append(href)
                elif not href.startswith(("//", "file", "javascript", "tel", "mailto", "http")):
                    # Relative link: resolve it against the current page.
                    links.append(urljoin(url + "/", href))
            # Drop fragment links and deduplicate once, after the loop,
            # rather than re-filtering on every anchor.
            return list({link for link in links if "#" not in link})
|

        # Crawl one level deep: fetch every page the start page links to,
        # stopping once the time budget is spent.
        links = get_links_from_page(url)
        unique_links = set()
        for link in links:
            if time.time() - start > timeout:
                break
            unique_links.update(get_links_from_page(link))
        # Strip trailing slashes so /path and /path/ deduplicate.
        return list({link.rstrip("/") for link in unique_links})
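

# The module imports ThreadPoolExecutor without using it. The sketch below is
# one assumed way to put it to work: fetching the second-level pages
# concurrently instead of one at a time. fetch_links restates the nested
# helper as a module-level function so it can be submitted to an executor;
# the names and the worker count are illustrative, not part of the original.
def fetch_links(url: str, timeout: float = 4) -> set:
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return set()
    soup = BeautifulSoup(response.content, "lxml")
    links = set()
    for anchor in soup.find_all("a"):
        href = anchor.attrs.get("href")
        if href is None or "#" in href:
            continue
        if urlparse(href).netloc == urlparse(url).netloc:
            links.add(href)
        elif not href.startswith(("//", "file", "javascript", "tel", "mailto", "http")):
            links.add(urljoin(url + "/", href))
    return links


def get_links_parallel(url: str, max_workers: int = 8) -> list:
    # Fan the second-level fetches out across a thread pool; the I/O-bound
    # requests overlap instead of running back to back.
    unique_links = set()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for links in executor.map(fetch_links, fetch_links(url)):
            unique_links.update(links)
    return list({link.rstrip("/") for link in unique_links})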
|
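

# A minimal usage sketch, assuming the file is run directly; the start URL is
# a placeholder, not taken from the original.
if __name__ == "__main__":
    scraper = WebScraper()
    found = scraper.get_links("https://example.com", timeout=4)
    print(f"get_links found {len(found)} links")
    print(f"get_links_parallel found {len(get_links_parallel('https://example.com'))} links")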