"""Test Deep Lake functionality."""
import deeplake
import pytest
from pytest import FixtureRequest
from langchain.docstore.document import Document
from langchain.vectorstores import DeepLake
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
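
# NOTE: FakeEmbeddings is a deterministic test helper; it is assumed here to
# embed every query identically to the first document ("foo"), which is what
# makes the exact score assertions below (1.0 for "cos", 0.0 for the distance
# metrics) hold.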


@pytest.fixture
def deeplake_datastore() -> DeepLake:
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = DeepLake.from_texts(
        dataset_path="mem://test_path",
        texts=texts,
        metadatas=metadatas,
        embedding=FakeEmbeddings(),
    )
    return docsearch


# Parametrize distance_metric so every test requesting this fixture runs once
# per supported Deep Lake metric.
@pytest.fixture(params=["L1", "L2", "max", "cos"])
def distance_metric(request: FixtureRequest) -> str:
    return request.param


def test_deeplake() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    docsearch = DeepLake.from_texts(
        dataset_path="mem://test_path", texts=texts, embedding=FakeEmbeddings()
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]


def test_deeplake_with_metadatas() -> None:
    """Test end to end construction and search with metadatas."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = DeepLake.from_texts(
        dataset_path="mem://test_path",
        texts=texts,
        embedding=FakeEmbeddings(),
        metadatas=metadatas,
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo", metadata={"page": "0"})]


def test_deeplake_with_persistence() -> None:
    """Test end to end construction and search, with persistence."""
    dataset_path = "./tests/persist_dir"
    if deeplake.exists(dataset_path):
        deeplake.delete(dataset_path)

    texts = ["foo", "bar", "baz"]
    docsearch = DeepLake.from_texts(
        dataset_path=dataset_path,
        texts=texts,
        embedding=FakeEmbeddings(),
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]

    docsearch.persist()

    # Get a new VectorStore from the persisted directory and verify the same
    # search still succeeds.
    docsearch = DeepLake(
        dataset_path=dataset_path,
        embedding_function=FakeEmbeddings(),
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]

    # Clean up
    docsearch.delete_dataset()

    # persist() doesn't need to be called again; data is automatically
    # persisted on object deletion or on program exit.


def test_similarity_search(deeplake_datastore: DeepLake, distance_metric: str) -> None:
    """Test similarity search."""
    output = deeplake_datastore.similarity_search(
        "foo", k=1, distance_metric=distance_metric
    )
    assert output == [Document(page_content="foo", metadata={"page": "0"})]
    deeplake_datastore.delete_dataset()


def test_similarity_search_by_vector(
    deeplake_datastore: DeepLake, distance_metric: str
) -> None:
    """Test similarity search by vector."""
    embeddings = FakeEmbeddings().embed_documents(["foo", "bar", "baz"])
    output = deeplake_datastore.similarity_search_by_vector(
        embeddings[1], k=1, distance_metric=distance_metric
    )
    assert output == [Document(page_content="bar", metadata={"page": "1"})]
    deeplake_datastore.delete_dataset()


def test_similarity_search_with_score(
    deeplake_datastore: DeepLake, distance_metric: str
) -> None:
    """Test similarity search with score."""
    output, score = deeplake_datastore.similarity_search_with_score(
        "foo", k=1, distance_metric=distance_metric
    )[0]
    assert output == Document(page_content="foo", metadata={"page": "0"})
    # The query embedding matches the "foo" document exactly, so "cos" (a
    # similarity) scores 1.0 while the distance metrics score 0.0.
    if distance_metric == "cos":
        assert score == 1.0
    else:
        assert score == 0.0
    deeplake_datastore.delete_dataset()


def test_similarity_search_with_filter(
    deeplake_datastore: DeepLake, distance_metric: str
) -> None:
    """Test similarity search with a metadata filter."""
    output = deeplake_datastore.similarity_search(
        "foo", k=1, distance_metric=distance_metric, filter={"page": "1"}
    )
    assert output == [Document(page_content="bar", metadata={"page": "1"})]
    deeplake_datastore.delete_dataset()


def test_max_marginal_relevance_search(deeplake_datastore: DeepLake) -> None:
    """Test max marginal relevance search, by query and by vector."""
    # MMR fetches fetch_k candidates, then selects k results balancing
    # relevance against diversity.
    output = deeplake_datastore.max_marginal_relevance_search("foo", k=1, fetch_k=2)
    assert output == [Document(page_content="foo", metadata={"page": "0"})]

    embeddings = FakeEmbeddings().embed_documents(["foo", "bar", "baz"])
    output = deeplake_datastore.max_marginal_relevance_search_by_vector(
        embeddings[0], k=1, fetch_k=2
    )
    assert output == [Document(page_content="foo", metadata={"page": "0"})]
    deeplake_datastore.delete_dataset()


def test_delete_dataset_by_ids(deeplake_datastore: DeepLake) -> None:
    """Test deleting documents by id."""
    doc_id = deeplake_datastore.ds.ids.data()["value"][0]
    deeplake_datastore.delete(ids=[doc_id])
    assert deeplake_datastore.similarity_search("foo", k=1, filter={"page": "0"}) == []
    assert len(deeplake_datastore.ds) == 2
    deeplake_datastore.delete_dataset()


def test_delete_dataset_by_filter(deeplake_datastore: DeepLake) -> None:
    """Test deleting documents by metadata filter."""
    deeplake_datastore.delete(filter={"page": "1"})
    assert deeplake_datastore.similarity_search("bar", k=1, filter={"page": "1"}) == []
    assert len(deeplake_datastore.ds) == 2
    deeplake_datastore.delete_dataset()


def test_delete_by_path(deeplake_datastore: DeepLake) -> None:
    """Test force-deleting a dataset by path."""
    path = deeplake_datastore.dataset_path
    DeepLake.force_delete_by_path(path)
    assert not deeplake.exists(path)
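

# A typical local invocation for this integration suite (assumes the
# `deeplake` package and the repo's test dependencies are installed):
#   pytest tests/integration_tests/vectorstores/test_deeplake.py -v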