Spaces:
Running
Running
import os | |
from typing import Generator, List, Union | |
import pytest | |
from vcr.request import Request | |
from langchain.document_loaders import TextLoader | |
from langchain.embeddings import OpenAIEmbeddings | |
from langchain.schema import Document | |
from langchain.text_splitter import CharacterTextSplitter | |
# Those environment variables turn on Deep Lake pytest mode. | |
# It significantly makes tests run much faster. | |
# Need to run before `import deeplake` | |
os.environ["BUGGER_OFF"] = "true" | |
os.environ["DEEPLAKE_DOWNLOAD_PATH"] = "./testing/local_storage" | |
os.environ["DEEPLAKE_PYTEST_ENABLED"] = "true" | |
# This fixture returns a dictionary containing filter_headers options | |
# for replacing certain headers with dummy values during cassette playback | |
# Specifically, it replaces the authorization header with a dummy value to | |
# prevent sensitive data from being recorded in the cassette. | |
# It also filters request to certain hosts (specified in the `ignored_hosts` list) | |
# to prevent data from being recorded in the cassette. | |
def vcr_config() -> dict: | |
skipped_host = ["pinecone.io"] | |
def before_record_response(response: dict) -> Union[dict, None]: | |
return response | |
def before_record_request(request: Request) -> Union[Request, None]: | |
for host in skipped_host: | |
if request.host.startswith(host) or request.host.endswith(host): | |
return None | |
return request | |
return { | |
"before_record_request": before_record_request, | |
"before_record_response": before_record_response, | |
"filter_headers": [ | |
("authorization", "authorization-DUMMY"), | |
("X-OpenAI-Client-User-Agent", "X-OpenAI-Client-User-Agent-DUMMY"), | |
("Api-Key", "Api-Key-DUMMY"), | |
("User-Agent", "User-Agent-DUMMY"), | |
], | |
"ignore_localhost": True, | |
} | |
# Define a fixture that yields a generator object returning a list of documents | |
def documents() -> Generator[List[Document], None, None]: | |
"""Return a generator that yields a list of documents.""" | |
# Create a CharacterTextSplitter object for splitting the documents into chunks | |
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) | |
# Load the documents from a file located in the fixtures directory | |
documents = TextLoader( | |
os.path.join(os.path.dirname(__file__), "fixtures", "sharks.txt") | |
).load() | |
# Yield the documents split into chunks | |
yield text_splitter.split_documents(documents) | |
def texts() -> Generator[List[str], None, None]: | |
# Load the documents from a file located in the fixtures directory | |
documents = TextLoader( | |
os.path.join(os.path.dirname(__file__), "fixtures", "sharks.txt") | |
).load() | |
yield [doc.page_content for doc in documents] | |
def embedding_openai() -> OpenAIEmbeddings: | |
return OpenAIEmbeddings() | |