File size: 1,223 Bytes
cfd3735
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import sys
from pathlib import Path

import pytest

from langchain.document_loaders.html_bs import BSHTMLLoader


def test_bs_html_loader() -> None:
    """Load the example HTML file with BSHTMLLoader and check the result.

    The loader is built with ``get_text_separator="|"``. The test expects
    exactly one document whose ``title`` and ``source`` metadata match the
    example file and whose content begins with a newline followed by the
    separator.
    """
    # The examples directory sits next to the directory containing this file.
    html_file = Path(__file__).parent.parent / "examples/example.html"
    documents = BSHTMLLoader(str(html_file), get_text_separator="|").load()

    assert len(documents) == 1

    document = documents[0]
    meta = document.metadata
    text = document.page_content

    assert meta["title"] == "Chew dad's slippers"
    assert meta["source"] == str(html_file)

    # Only the leading two characters are checked: newline, then separator.
    prefix = text[:2]
    assert prefix == "\n|"


@pytest.mark.skipif(
    # Run only on Windows with UTF-8 mode switched off; skip everywhere else.
    not (sys.platform.startswith("win") and not sys.flags.utf8_mode),
    reason="default encoding is utf8",
)
def test_bs_html_loader_non_utf8() -> None:
    """Check that BSHTMLLoader honours an explicit ``open_encoding``.

    Loading the UTF-8 example file without an encoding is expected to raise
    ``UnicodeDecodeError``; loading it with ``open_encoding="utf8"`` is
    expected to yield a single document with the right ``title`` and
    ``source`` metadata.
    """
    html_file = Path(__file__).parent.parent / "examples/example-utf8.html"
    source = str(html_file)

    # No encoding given: both construction and load sit inside the raises
    # block, so a decode error from either one satisfies the expectation.
    with pytest.raises(UnicodeDecodeError):
        BSHTMLLoader(source).load()

    documents = BSHTMLLoader(source, open_encoding="utf8").load()

    assert len(documents) == 1

    meta = documents[0].metadata

    assert meta["title"] == "Chew dad's slippers"
    assert meta["source"] == source