Spaces:
Runtime error
Runtime error
File size: 1,517 Bytes
ac493ec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
import logging
from pathlib import Path
from urllib.parse import urlparse
import scrapy
# Module-level logging setup (runs once at import time): only ERROR and
# above reach the console, which keeps crawl output quiet.
# NOTE(review): Scrapy normally installs its own logging config; confirm
# this basicConfig call is not overridden when run via `scrapy crawl`.
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR)
class DocsSpider(scrapy.Spider):
    """Crawl a Read the Docs project and mirror each fetched page to disk.

    Pages are saved under ``save_dir/<host>/<url-path>``; directory-style
    URLs (trailing slash or empty path) are stored as ``.../index.html``.
    """

    name = "docs"

    def __init__(self, homepage_url: str, save_dir: str = "crawled_pages", *args, **kwargs):
        """Set up the crawl from the project's homepage URL.

        Args:
            homepage_url: Documentation homepage, with or without a scheme
                (a bare ``proj.readthedocs.io`` is accepted).
            save_dir: Root directory for the mirrored pages.
        """
        super().__init__(*args, **kwargs)
        # Accept both http:// and https:// inputs; the original check only
        # recognized https://, turning http:// URLs into "https://http://...".
        if not homepage_url.startswith(("http://", "https://")):
            homepage_url = "https://" + homepage_url
        # Derive the project slug from the hostname's first label via
        # urlparse instead of fragile string splitting on the scheme.
        netloc = urlparse(homepage_url).netloc
        project = netloc.split(".")[0]
        self.allowed_domains = [f"{project}.readthedocs.io"]
        self.start_urls = [homepage_url]
        self.base_dir = Path(save_dir)

    def parse(self, response):
        """Save the response body to disk, then follow every link on the page.

        Yields:
            Requests for each ``<a href>`` found (filtered by allowed_domains).
        """
        parsed_uri = urlparse(response.url)
        # NOTE(review): path segments come from crawled links (untrusted);
        # allowed_domains limits exposure, but '..' components would escape
        # base_dir — consider validating with Path.resolve().
        filepath = self.base_dir / parsed_uri.netloc / parsed_uri.path.strip("/")
        # Directory-style URLs ('/' suffix or no path at all) become
        # index.html; the empty-path case previously produced a bare
        # directory path and open() raised IsADirectoryError.
        if parsed_uri.path.endswith("/") or not parsed_uri.path:
            filepath = filepath / "index.html"
        filepath.parent.mkdir(parents=True, exist_ok=True)
        self.logger.debug("saving %s", filepath)
        filepath.write_bytes(response.body)
        # Follow links to other documentation pages.
        for href in response.css("a::attr(href)").getall():
            yield response.follow(href, self.parse)
|