File size: 3,881 Bytes
cfd3735
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from typing import Any

import pytest

from langchain.document_loaders import SitemapLoader


def test_sitemap() -> None:
    """Load the whole sitemap with default settings.

    Expects more than one document and the "🦜🔗" marker in the first
    document's page content.
    """
    loader = SitemapLoader("https://langchain.readthedocs.io/sitemap.xml")
    documents = loader.load()
    assert len(documents) > 1
    # The marker must be the real emoji pair; a mis-decoded (mojibake) copy of
    # these bytes can never match correctly decoded page text.
    assert "🦜🔗" in documents[0].page_content


def test_sitemap_block() -> None:
    """Load a single block of the sitemap (``blocksize=1``, ``blocknum=1``).

    Expects exactly one document, whose page content contains the "🦜🔗"
    marker.
    """
    loader = SitemapLoader(
        "https://langchain.readthedocs.io/sitemap.xml", blocksize=1, blocknum=1
    )
    documents = loader.load()
    assert len(documents) == 1
    # The marker must be the real emoji pair; a mis-decoded (mojibake) copy of
    # these bytes can never match correctly decoded page text.
    assert "🦜🔗" in documents[0].page_content


def test_sitemap_block_only_one() -> None:
    """Load block 0 with a very large ``blocksize`` (1000000).

    Expects more than one document and the "🦜🔗" marker in the first
    document's page content.
    """
    loader = SitemapLoader(
        "https://langchain.readthedocs.io/sitemap.xml", blocksize=1000000, blocknum=0
    )
    documents = loader.load()
    assert len(documents) > 1
    # The marker must be the real emoji pair; a mis-decoded (mojibake) copy of
    # these bytes can never match correctly decoded page text.
    assert "🦜🔗" in documents[0].page_content


def test_sitemap_block_blocknum_default() -> None:
    """Load with a very large ``blocksize`` and no explicit ``blocknum``.

    Expects more than one document and the "🦜🔗" marker in the first
    document's page content.
    """
    loader = SitemapLoader(
        "https://langchain.readthedocs.io/sitemap.xml", blocksize=1000000
    )
    documents = loader.load()
    assert len(documents) > 1
    # The marker must be the real emoji pair; a mis-decoded (mojibake) copy of
    # these bytes can never match correctly decoded page text.
    assert "🦜🔗" in documents[0].page_content


def test_sitemap_block_size_to_small() -> None:
    """Constructing the loader with ``blocksize=0`` must raise ``ValueError``."""
    sitemap_url = "https://langchain.readthedocs.io/sitemap.xml"
    expected_message = "Sitemap blocksize should be at least 1"
    with pytest.raises(ValueError, match=expected_message):
        SitemapLoader(sitemap_url, blocksize=0)


def test_sitemap_block_num_to_small() -> None:
    """Constructing the loader with ``blocknum=-1`` must raise ``ValueError``."""
    sitemap_url = "https://langchain.readthedocs.io/sitemap.xml"
    expected_message = "Sitemap blocknum can not be lower then 0"
    with pytest.raises(ValueError, match=expected_message):
        SitemapLoader(sitemap_url, blocksize=1000000, blocknum=-1)


def test_sitemap_block_does_not_exists() -> None:
    """Loading ``blocknum=15`` with a huge ``blocksize`` must raise ``ValueError``.

    The loader is constructed outside the ``raises`` context, so the error is
    expected from ``load()`` rather than from the constructor.
    """
    sitemap_url = "https://langchain.readthedocs.io/sitemap.xml"
    expected_message = (
        "Selected sitemap does not contain enough blocks for given blocknum"
    )
    loader = SitemapLoader(sitemap_url, blocksize=1000000, blocknum=15)
    with pytest.raises(ValueError, match=expected_message):
        loader.load()


def test_filter_sitemap() -> None:
    """Load the sitemap restricted by ``filter_urls`` to a single URL.

    Expects exactly one document, whose page content contains the "🦜🔗"
    marker.
    """
    loader = SitemapLoader(
        "https://langchain.readthedocs.io/sitemap.xml",
        filter_urls=["https://python.langchain.com/en/stable/"],
    )
    documents = loader.load()
    assert len(documents) == 1
    # The marker must be the real emoji pair; a mis-decoded (mojibake) copy of
    # these bytes can never match correctly decoded page text.
    assert "🦜🔗" in documents[0].page_content


def test_sitemap_metadata() -> None:
    """Check that a custom ``meta_function`` can add a key to document metadata.

    The helper ignores the page content and returns the incoming metadata
    extended with a fixed ``"mykey"`` entry; the first loaded document must
    then expose that entry.
    """

    def sitemap_metadata_one(meta: dict, _content: Any) -> dict:
        # The content argument is unused here; only the metadata is extended.
        return {**meta, "mykey": "Super Important Metadata"}

    loader = SitemapLoader(
        "https://langchain.readthedocs.io/sitemap.xml",
        meta_function=sitemap_metadata_one,
    )
    documents = loader.load()
    assert len(documents) > 1
    assert "mykey" in documents[0].metadata
    assert "Super Important Metadata" in documents[0].metadata["mykey"]


def test_sitemap_metadata_extraction() -> None:
    """Check that a ``meta_function`` can derive metadata from the page content.

    The helper looks up a ``title`` element in the content and, when one is
    found, stores its text under the ``"title"`` metadata key. The first loaded
    document must then carry a title containing "LangChain".
    """

    def sitemap_metadata_two(meta: dict, content: Any) -> dict:
        # NOTE(review): presumably ``content`` is a parsed-HTML object exposing
        # ``find`` / ``get_text`` (e.g. BeautifulSoup) — confirm against loader.
        title = content.find("title")
        if title:
            return {**meta, "title": title.get_text()}
        # No title found: pass the metadata through unchanged.
        return meta

    loader = SitemapLoader(
        "https://langchain.readthedocs.io/sitemap.xml",
        meta_function=sitemap_metadata_two,
    )
    documents = loader.load()
    assert len(documents) > 1
    assert "title" in documents[0].metadata
    assert "LangChain" in documents[0].metadata["title"]


def test_sitemap_metadata_default() -> None:
    """Without a ``meta_function``, metadata must contain ``source`` and ``loc``."""
    sitemap_url = "https://langchain.readthedocs.io/sitemap.xml"
    documents = SitemapLoader(sitemap_url).load()
    assert len(documents) > 1
    first_metadata = documents[0].metadata
    assert "source" in first_metadata
    assert "loc" in first_metadata