File size: 2,699 Bytes
c1dc2ee
 
 
 
 
 
 
 
 
 
 
 
 
 
8d5b271
c1dc2ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cf5bcf
 
c1dc2ee
 
 
4cf5bcf
7f20d45
 
 
 
 
 
c1dc2ee
 
 
 
 
7f20d45
c1dc2ee
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from datetime import datetime
from pathlib import Path
from typing import Iterator

from langchain.docstore.document import Document
from langchain.document_loaders import ReadTheDocsLoader


class RTDHtmlPageLoader(ReadTheDocsLoader):
    """Load a locally mirrored HTML documentation tree as ``Document`` objects.

    Point the loader at a directory of downloaded HTML pages, e.g.:

    $ wget -r -np -A.html https://docs.djangoproject.com/en/4.2/
    $ python store.py -l rtdhtmlpage django ./docs.djangoproject.com/

    Each file's path is turned into its URL (``https://<path>``), so the
    mirror directory should be named after the site's host, as ``wget -r``
    does.
    """

    def __init__(self, inputfile: Path, *args, **kwargs):
        """Initialise the loader for the mirror rooted at *inputfile*.

        ``custom_html_tag`` is always set to ``<div id="docs-content">``;
        any value supplied by the caller under that keyword is replaced.
        Remaining arguments are forwarded to ``ReadTheDocsLoader``.
        """
        kwargs["custom_html_tag"] = ("div", {"id": "docs-content"})
        super().__init__(inputfile, *args, **kwargs)

    def _my_clean_data(self, data: str) -> tuple[str, str]:
        """Extract the main text and the page title from raw HTML.

        Args:
            data: HTML source of one page.

        Returns:
            A ``(text, title)`` pair. ``text`` is the text of the first
            matching content container with empty lines removed; ``title``
            is the text of that container's first ``<h1>``, ignoring its
            direct ``<a>`` children. Both are ``""`` when no container
            matches, and ``title`` is ``""`` when the container has no
            ``<h1>``.
        """
        # Imported lazily so the module can be imported without bs4.
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(data, **self.bs_kwargs)

        # Default containers, tried after the custom one.
        html_tags = [
            ("div", {"role": "main"}),
            ("main", {"id": "main-content"}),
        ]
        if self.custom_html_tag is not None:
            html_tags.append(self.custom_html_tag)

        # Walk the candidates in reverse so the custom tag is checked first.
        container = None
        for tag, attrs in reversed(html_tags):
            container = soup.find(tag, attrs)
            if container is not None:
                break

        if container is None:
            return "", ""

        # A page may lack an <h1>; iterating None would raise TypeError.
        heading = container.find("h1")
        if heading is None:
            title = ""
        else:
            # Skip direct <a> children of the heading.
            title = "".join(
                child.text for child in heading if child.name != "a"
            )

        # Drop empty lines from the extracted text.
        text = "\n".join(
            line for line in container.get_text().split("\n") if line
        )
        return text, title

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per file found under ``self.file_path``.

        Every non-directory entry is read with the loader's configured
        encoding and error handling, cleaned with ``_my_clean_data``, and
        emitted with metadata holding the title, creation time, URL and id.
        """
        for p in self.file_path.rglob("*"):
            if p.is_dir():
                continue
            # FIXME: would like to force utf-8 here instead of self.encoding.
            with open(p, encoding=self.encoding, errors=self.errors) as f:
                text, title = self._my_clean_data(f.read())

            if "docs.djangoproject.com" in p.parts and p.name == "index.html":
                # In the Django docs, requesting index.html directly gives a
                # 404, so link to the containing directory instead.
                p = p.parent
                url = f"https://{p.as_posix()}/"
            else:
                # as_posix() keeps forward slashes in the URL on Windows too.
                url = f"https://{p.as_posix()}"

            metadata = {
                "title": title,
                "ctime": int(datetime.now().timestamp()),
                "user": "rtd",
                "type": "rtd",
                "url": url,
                "id": str(p),
            }
            yield Document(page_content=text, metadata=metadata)

    def load(self) -> list[Document]:
        """Eagerly load every document and return them as a list."""
        return list(self.lazy_load())