Spaces:
Running
Running
| from datetime import datetime | |
| from pathlib import Path | |
| from typing import Iterator | |
| from langchain.docstore.document import Document | |
| from langchain.document_loaders import ReadTheDocsLoader | |
class RTDHtmlPageLoader(ReadTheDocsLoader):
    """Load a locally mirrored Read the Docs-style HTML tree as documents.

    The page body is looked up in ``<div id="docs-content">`` first, then in
    the fallback containers ``<main id="main-content">`` and
    ``<div role="main">``.

    Example (mirroring the Django documentation)::

        $ wget -r -np -A.html https://docs.djangoproject.com/en/4.2/
        $ python store.py -l rtdhtmlpage django ./docs.djangoproject.com/
    """

    def __init__(self, inputfile: Path, *args, **kwargs):
        """Initialise the loader for the mirrored directory ``inputfile``.

        ``custom_html_tag`` is always forced to ``("div", {"id":
        "docs-content"})``; all other arguments are forwarded unchanged to
        ``ReadTheDocsLoader.__init__``.
        """
        kwargs["custom_html_tag"] = ("div", {"id": "docs-content"})
        super().__init__(inputfile, *args, **kwargs)

    def _my_clean_data(self, data: str) -> tuple[str, str]:
        """Extract the main text and the page title from raw HTML.

        Args:
            data: HTML source of a single page.

        Returns:
            A ``(text, title)`` pair. ``text`` is the text of the first
            matching content container with empty lines removed. ``title``
            is the text of that container's first ``<h1>``, skipping its
            direct ``<a>`` children. Both are empty strings when no
            container matches; ``title`` alone is empty when the container
            has no ``<h1>``.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(data, **self.bs_kwargs)

        # Fallback containers, lowest priority first.
        html_tags = [
            ("div", {"role": "main"}),
            ("main", {"id": "main-content"}),
        ]
        if self.custom_html_tag is not None:
            html_tags.append(self.custom_html_tag)

        # Walk the candidates in reverse so the custom tag is tried first.
        content = None
        for tag, attrs in reversed(html_tags):
            content = soup.find(tag, attrs)
            if content is not None:
                break

        if content is None:
            return "", ""

        # find() returns None when there is no <h1>; iterating None would
        # raise TypeError, so fall back to an empty title instead.
        heading = content.find("h1")
        if heading is None:
            title = ""
        else:
            title = "".join(
                child.text for child in heading if child.name != "a"
            )

        # Drop empty lines from the extracted text.
        text = "\n".join(
            line for line in content.get_text().split("\n") if line
        )
        return text, title

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield one ``Document`` per file under ``self.file_path``.

        Directories are skipped; every other path found by the recursive
        glob is read with the loader's ``encoding`` / ``errors`` settings and
        passed through ``_my_clean_data``.

        Yields:
            Documents whose ``page_content`` is the cleaned page text and
            whose metadata holds ``title``, ``ctime`` (integer Unix timestamp
            taken when the document is produced), ``user``, ``type``, ``url``
            and ``id``. The URL is ``https://`` followed by the file's path
            as given relative to how ``file_path`` was specified.
        """
        for p in self.file_path.rglob("*"):
            if p.is_dir():
                continue
            # FIXME: would prefer to force UTF-8 here, i.e.
            # open(p, encoding="utf-8", errors="ignore").
            with open(p, encoding=self.encoding, errors=self.errors) as f:
                text, title = self._my_clean_data(f.read())
            if "docs.djangoproject.com" in p.parts and p.name == "index.html":
                # The Django docs answer 404 for ".../index.html", so point
                # the URL (and the id below) at the parent directory instead.
                p = p.parent
                url = f"https://{p.as_posix()}/"
            else:
                # as_posix() keeps forward slashes in the URL on every OS.
                url = f"https://{p.as_posix()}"
            metadata = {
                "title": title,
                "ctime": int(datetime.now().timestamp()),
                "user": "rtd",
                "type": "rtd",
                "url": url,
                "id": str(p),
            }
            yield Document(page_content=text, metadata=metadata)

    def load(self) -> list[Document]:
        """Eagerly load every document produced by ``lazy_load``."""
        return list(self.lazy_load())