# NOTE(review): lines here were web-scraping residue (page status text, byte
# size, git blame hashes, a line-number gutter) — not part of the module.
# Neutralized into this comment so the file remains importable.
import concurrent.futures
import threading
from pathlib import Path

import requests
import tldextract

from utils.enver import enver
from utils.logger import logger
from networks.filepath_converter import UrlToFilepathConverter
from networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS
class WebpageFetcher:
    """Fetch a single webpage over HTTP and cache it as a local file.

    The target filepath is derived from the URL by UrlToFilepathConverter;
    hosts listed in IGNORE_HOSTS are skipped. Proxy settings come from the
    shared `enver` environment helper.
    """

    def __init__(self):
        self.enver = enver
        # Enable proxy environment so requests can route through it.
        self.enver.set_envs(proxies=True)
        self.filepath_converter = UrlToFilepathConverter()

    def is_ignored_host(self, url):
        """Return True if the URL's registered domain is in IGNORE_HOSTS.

        Side effect: stores the domain on self.host so fetch() can log it.
        """
        self.host = tldextract.extract(url).registered_domain
        return self.host in IGNORE_HOSTS

    def send_request(self, timeout=30):
        """GET self.url and store the response on self.request_response.

        Args:
            timeout: seconds before the request aborts. Added so a stalled
                server cannot hang the fetcher (and its thread pool) forever.
        """
        self.request_response = requests.get(
            url=self.url,
            headers=REQUESTS_HEADERS,
            proxies=self.enver.requests_proxies,
            timeout=timeout,
        )

    def save_response(self):
        """Write the fetched response body to self.output_path as bytes."""
        # Create the parent directory unconditionally; exist_ok makes this
        # a no-op when it is already there (simpler than checking first).
        self.output_path.parent.mkdir(parents=True, exist_ok=True)
        logger.success(f"Saving to: [{self.output_path}]")
        with open(self.output_path, "wb") as wf:
            wf.write(self.request_response.content)

    def fetch(self, url, overwrite=False, output_parent=None):
        """Fetch `url` to its cache path and return that path.

        Args:
            url: the webpage URL to fetch.
            overwrite: re-download even if the cached file already exists.
            output_parent: optional parent folder for the cache file.

        Returns:
            pathlib.Path of the cached file (returned even when the host is
            ignored or the file already existed).
        """
        self.url = url
        logger.note(f"Fetching: [{self.url}]")
        self.output_path = self.filepath_converter.convert(
            self.url, parent=output_parent
        )
        if self.is_ignored_host(self.url):
            logger.warn(f"Ignore host: [{self.host}]")
            return self.output_path
        if self.output_path.exists() and not overwrite:
            logger.success(f"HTML existed: [{self.output_path}]")
        else:
            self.send_request()
            self.save_response()
        return self.output_path
class BatchWebpageFetcher:
    """Fetch multiple webpages concurrently via a thread pool."""

    def __init__(self):
        # Progress counters; done_count is incremented from worker threads,
        # so updates are guarded by a lock (+= is not atomic).
        self.done_count = 0
        self.total_count = 0
        self._count_lock = threading.Lock()

    def fecth_single_webpage(self, url, overwrite=False, output_parent=None):
        """Fetch one url with a fresh WebpageFetcher and log progress.

        (Misspelled name kept for backward compatibility with existing
        callers; `fetch_single_webpage` below is the correctly spelled alias.)
        """
        webpage_fetcher = WebpageFetcher()
        webpage_fetcher.fetch(url=url, overwrite=overwrite, output_parent=output_parent)
        with self._count_lock:
            self.done_count += 1
            done = self.done_count
        logger.success(f"> {done}/{self.total_count}: {url}")

    # Correctly spelled alias for new code.
    fetch_single_webpage = fecth_single_webpage

    def fetch(self, urls, overwrite=False, output_parent=None):
        """Fetch all `urls` concurrently; blocks until every fetch finishes.

        Args:
            urls: iterable of webpage URLs.
            overwrite: passed through to WebpageFetcher.fetch.
            output_parent: passed through to WebpageFetcher.fetch.
        """
        self.urls = urls
        self.total_count = len(self.urls)
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(
                    self.fecth_single_webpage,
                    url=url,
                    overwrite=overwrite,
                    output_parent=output_parent,
                )
                for url in urls
            ]
            # result() re-raises any exception from a worker; the return
            # values themselves are unused.
            for future in concurrent.futures.as_completed(futures):
                future.result()
if __name__ == "__main__":
urls = [
"https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename",
"https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528",
"https://docs.python.org/zh-cn/3/tutorial/interpreter.html",
]
batch_webpage_fetcher = BatchWebpageFetcher()
batch_webpage_fetcher.fetch(
urls=urls, overwrite=True, output_parent="python tutorials"
)