import logging
from pathlib import Path
from urllib.parse import urlparse

import scrapy

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR)


class DocsSpider(scrapy.Spider):
    name = "docs"

    def __init__(self, homepage_url: str, save_dir="crawled_pages", *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Normalise the start URL so it always carries an https:// scheme.
        if not homepage_url.startswith("https://"):
            homepage_url = "https://" + homepage_url

        # The Read the Docs project name is the first subdomain of the homepage,
        # e.g. "scrapy" for https://scrapy.readthedocs.io/en/latest/.
        project: str = homepage_url.split("https://")[1].split(".")[0]
        self.allowed_domains = [f"{project}.readthedocs.io"]
        self.start_urls = [homepage_url]
        self.base_dir = Path(save_dir)

    def parse(self, response):
        parsed_uri = urlparse(response.url)
        # Build the local file path from the URL; if the URL path ends with '/',
        # save the page as 'index.html' inside that directory.
        if parsed_uri.path.endswith("/"):
            filepath = (
                self.base_dir
                / parsed_uri.netloc
                / parsed_uri.path.strip("/")
                / "index.html"
            )
        else:
            filepath = self.base_dir / parsed_uri.netloc / parsed_uri.path.strip("/")
        # Make sure the target directory exists before writing.
        filepath.parent.mkdir(parents=True, exist_ok=True)

        print(f"{filepath=}")
        # Mirror the raw page bytes to disk.
        with open(filepath, "wb") as f:
            f.write(response.body)

        # Follow links to other documentation pages. Requests outside
        # allowed_domains are dropped by Scrapy's offsite middleware, so the
        # crawl stays within the project's readthedocs.io site.
        for href in response.css("a::attr(href)").getall():
            yield response.follow(href, self.parse)
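

# Usage sketch (an assumption, not shown in the original file): the spider can
# be run from the command line with
#   scrapy runspider docs_spider.py -a homepage_url=scrapy.readthedocs.io
# or started programmatically as below. The filename "docs_spider.py" and the
# example homepage_url are placeholders.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    # Keyword arguments are forwarded to DocsSpider.__init__.
    process.crawl(DocsSpider, homepage_url="scrapy.readthedocs.io")
    process.start()  # Blocks until the crawl finishes.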