Spaces:
Sleeping
Sleeping
"""Module to test generic parsers.""" | |
from typing import Iterator | |
import pytest | |
from langchain.document_loaders.base import BaseBlobParser | |
from langchain.document_loaders.blob_loaders import Blob | |
from langchain.document_loaders.parsers.generic import MimeTypeBasedParser | |
from langchain.schema import Document | |
class TestMimeBasedParser: | |
"""Test mime based parser.""" | |
def test_without_fallback_parser(self) -> None: | |
class FirstCharParser(BaseBlobParser): | |
def lazy_parse(self, blob: Blob) -> Iterator[Document]: | |
"""Extract the first character of a blob.""" | |
yield Document(page_content=blob.as_string()[0]) | |
class SecondCharParser(BaseBlobParser): | |
def lazy_parse(self, blob: Blob) -> Iterator[Document]: | |
"""Extract the second character of a blob.""" | |
yield Document(page_content=blob.as_string()[1]) | |
parser = MimeTypeBasedParser( | |
handlers={ | |
"text/plain": FirstCharParser(), | |
"text/html": SecondCharParser(), | |
}, | |
) | |
blob = Blob(data=b"Hello World", mimetype="text/plain") | |
docs = parser.parse(blob) | |
assert len(docs) == 1 | |
doc = docs[0] | |
assert doc.page_content == "H" | |
# Check text/html handler. | |
blob = Blob(data=b"Hello World", mimetype="text/html") | |
docs = parser.parse(blob) | |
assert len(docs) == 1 | |
doc = docs[0] | |
assert doc.page_content == "e" | |
blob = Blob(data=b"Hello World", mimetype="text/csv") | |
with pytest.raises(ValueError, match="Unsupported mime type"): | |
# Check that the fallback parser is used when the mimetype is not found. | |
parser.parse(blob) | |
def test_with_fallback_parser(self) -> None: | |
class FirstCharParser(BaseBlobParser): | |
def lazy_parse(self, blob: Blob) -> Iterator[Document]: | |
"""Extract the first character of a blob.""" | |
yield Document(page_content=blob.as_string()[0]) | |
class SecondCharParser(BaseBlobParser): | |
def lazy_parse(self, blob: Blob) -> Iterator[Document]: | |
"""Extract the second character of a blob.""" | |
yield Document(page_content=blob.as_string()[1]) | |
class ThirdCharParser(BaseBlobParser): | |
def lazy_parse(self, blob: Blob) -> Iterator[Document]: | |
"""Extract the third character of a blob.""" | |
yield Document(page_content=blob.as_string()[2]) | |
parser = MimeTypeBasedParser( | |
handlers={ | |
"text/plain": FirstCharParser(), | |
"text/html": SecondCharParser(), | |
}, | |
fallback_parser=ThirdCharParser(), | |
) | |
blob = Blob(data=b"Hello World", mimetype="text/plain") | |
docs = parser.parse(blob) | |
assert len(docs) == 1 | |
doc = docs[0] | |
assert doc.page_content == "H" | |
# Check text/html handler. | |
blob = Blob(data=b"Hello World", mimetype="text/html") | |
docs = parser.parse(blob) | |
assert len(docs) == 1 | |
doc = docs[0] | |
assert doc.page_content == "e" | |
# Check that the fallback parser is used when the mimetype is not found. | |
blob = Blob(data=b"Hello World", mimetype="text/csv") | |
docs = parser.parse(blob) | |
assert len(docs) == 1 | |
doc = docs[0] | |
assert doc.page_content == "l" | |