# core/services/get_links/web_scraper.py
import time

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin


class WebScraper:
    """Collects same-domain links from a page and the pages it links to."""

    def get_links(self, url: str, timeout: int = 4) -> list:
        start = time.time()

        def get_links_from_page(page_url: str) -> list:
            # Fetch the page; skip it if the request fails or times out.
            try:
                response = requests.get(page_url, timeout=timeout)
                response.raise_for_status()
            except requests.RequestException:
                return []

            soup = BeautifulSoup(response.content, "lxml")
            links = []
            for anchor in soup.find_all("a"):
                href = anchor.attrs.get("href")
                if not href:
                    continue
                if urlparse(href).netloc == urlparse(page_url).netloc:
                    # Absolute link that stays on the same domain.
                    links.append(href)
                elif not href.startswith(("//", "file", "javascript", "tel", "mailto", "http")):
                    # Relative link: resolve it against the page URL.
                    links.append(urljoin(page_url + "/", href))

            # Drop fragment links and deduplicate.
            links = [link for link in links if "#" not in link]
            return list(set(links))

        # First pass: links found on the starting page.
        links = get_links_from_page(url)

        # Second pass: links found on each of those pages, stopping once
        # the time budget is exhausted.
        unique_links = set()
        for link in links:
            if time.time() - start > timeout:
                break
            unique_links.update(get_links_from_page(link))

        # Normalise by stripping a single trailing slash, then deduplicate.
        return list({link[:-1] if link.endswith("/") else link for link in unique_links})
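

# Minimal usage sketch (assumption: the URL below is only a placeholder,
# not one used anywhere else in this project).
if __name__ == "__main__":
    scraper = WebScraper()
    # Crawl the starting page and, within the 4-second budget, the pages it
    # links to, printing every unique same-domain link collected.
    for link in scraper.get_links("https://example.com", timeout=4):
        print(link)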