# core/services/get_links/web_scraper.py
import time

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin


class WebScraper:
    """Collects same-domain links from a page and the pages it links to."""

    def get_links(self, url: str, timeout: int = 4) -> list:
        start = time.time()

        def get_links_from_page(page_url: str) -> list:
            # Fetch the page; skip it if the request fails or times out.
            try:
                response = requests.get(page_url, timeout=timeout)
                response.raise_for_status()
            except requests.RequestException:
                return []

            soup = BeautifulSoup(response.content, "lxml")
            links = []
            for anchor in soup.find_all("a"):
                href = anchor.attrs.get("href")
                if not href:
                    continue
                if urlparse(href).netloc == urlparse(page_url).netloc:
                    # Absolute link that stays on the same domain.
                    links.append(href)
                elif not href.startswith(("//", "file", "javascript", "tel", "mailto", "http")):
                    # Relative link: resolve it against the page URL.
                    links.append(urljoin(page_url + "/", href))

            # Drop fragment links and deduplicate.
            links = [link for link in links if "#" not in link]
            return list(set(links))

        # First pass: links found on the starting page.
        links = get_links_from_page(url)

        # Second pass: links found on each of those pages, stopping once
        # the time budget is exhausted.
        unique_links = set()
        for link in links:
            if time.time() - start > timeout:
                break
            unique_links.update(get_links_from_page(link))

        # Normalise by stripping a single trailing slash, then deduplicate.
        return list({link[:-1] if link.endswith("/") else link for link in unique_links})
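

# Minimal usage sketch (assumption: the URL below is only a placeholder,
# not one used anywhere else in this project).
if __name__ == "__main__":
    scraper = WebScraper()
    # Crawl the starting page and, within the 4-second budget, the pages it
    # links to, printing every unique same-domain link collected.
    for link in scraper.get_links("https://example.com", timeout=4):
        print(link)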