import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin


class WebScraper:
    """Collects same-domain links from a page and the pages it links to."""

    def get_links(self, url: str, timeout: float = 4) -> list:
        """Return unique same-domain links found on `url` and, within the
        time budget, on each page that `url` links to."""
        start = time.time()
        base_netloc = urlparse(url).netloc

        def get_links_from_page(page_url: str) -> list:
            # Fetch and parse the page; "lxml" requires the lxml package
            # (fall back to "html.parser" if it is not installed). The
            # overall budget doubles as the per-request timeout so a slow
            # server cannot hang the crawl.
            try:
                response = requests.get(page_url, timeout=timeout)
                response.raise_for_status()
            except requests.RequestException:
                return []
            soup = BeautifulSoup(response.content, "lxml")

            links = []
            for anchor in soup.find_all("a", href=True):
                href = anchor["href"]
                if urlparse(href).netloc == base_netloc:
                    # Absolute link on the same domain.
                    links.append(href)
                elif not href.startswith(
                    ("//", "file", "javascript", "tel", "mailto", "http")
                ):
                    # Relative link: resolve it against the page URL.
                    links.append(urljoin(page_url + "/", href))
            # Drop fragment links and de-duplicate once, after the loop.
            return list({link for link in links if "#" not in link})

        links = get_links_from_page(url)
        unique_links = set()
        for link in links:
            # Stop following second-level links once the budget is spent.
            if time.time() - start > timeout:
                break
            unique_links.update(get_links_from_page(link))
        # Normalize trailing slashes so ".../page/" and ".../page" collapse.
        return list({link.rstrip("/") for link in unique_links})
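

# A minimal usage sketch, assuming the module is run as a script; the
# target URL is a placeholder, so substitute a site you may crawl.
if __name__ == "__main__":
    scraper = WebScraper()
    # One level of crawling with a 4-second overall time budget.
    for link in sorted(scraper.get_links("https://example.com", timeout=4)):
        print(link)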