import json
import logging
import os
import re

import chromadb
from pydantic.v1 import BaseSettings
from dotenv import load_dotenv
from fastapi.encoders import jsonable_encoder
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Read a local .env file into the process environment before any client
# object is constructed.
load_dotenv()

logging.basicConfig(level=logging.DEBUG)

ABS_PATH = os.path.dirname(os.path.abspath(__file__))
DB_DIR = os.path.join(ABS_PATH, "db")

# Compiled once: matches any run of whitespace, newlines included.
_WHITESPACE_RUN = re.compile(r'\s+')


def replace_newlines_and_spaces(text):
    """Collapse every run of whitespace in *text* into a single space.

    Newlines are whitespace, so they are folded into the same substitution;
    no separate newline replacement is required.

    Args:
        text: The string to normalise.

    Returns:
        A copy of *text* in which each whitespace run (spaces, tabs,
        newlines, ...) is replaced by exactly one space. Leading and
        trailing runs become a single space as well; they are not stripped.
    """
    return _WHITESPACE_RUN.sub(' ', text)


def get_documents(pdf_path="fixtures/pdf/MorseVsFrederick.pdf"):
    """Load a PDF with ``PyPDFLoader`` and return the loaded documents.

    Args:
        pdf_path: Path of the PDF to load. Defaults to the bundled
            Morse v. Frederick fixture.

    Returns:
        Whatever ``PyPDFLoader(pdf_path).load()`` returns.
    """
    # NOTE(review): the default path is relative, so it is resolved against
    # the current working directory rather than ABS_PATH (unlike DB_DIR) —
    # confirm this is intended.
    return PyPDFLoader(pdf_path).load()


def _build_vectorstore():
    """Create the Chroma store used for both indexing and querying.

    Returns:
        A ``(vectorstore, embeddings)`` tuple: the ``Chroma`` instance bound
        to the ``langchain_store`` collection persisted under ``DB_DIR``, and
        the ``OpenAIEmbeddings`` object it was constructed with.
    """
    client_settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False,
    )
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma(
        collection_name="langchain_store",
        embedding_function=embeddings,
        client_settings=client_settings,
        persist_directory=DB_DIR,
    )
    return vectorstore, embeddings


def init_chromadb():
    """Index the fixture PDF into the persistent Chroma collection.

    Creates ``DB_DIR`` if needed, normalises the whitespace of every loaded
    document, adds the documents to the store, persists it and prints the
    store object.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(DB_DIR, exist_ok=True)

    vectorstore, embeddings = _build_vectorstore()

    documents = []
    for doc in get_documents():
        doc.page_content = replace_newlines_and_spaces(doc.page_content)
        documents.append(doc)

    vectorstore.add_documents(documents=documents, embedding=embeddings)
    vectorstore.persist()
    print(vectorstore)


def query_chromadb(query="who is FREDERICK?", k=4):
    """Run a similarity search against the persisted store and print it.

    Args:
        query: The natural-language query to search for.
        k: Number of results requested from the vector store.

    Raises:
        FileNotFoundError: If ``DB_DIR`` does not exist, i.e. nothing has
            been indexed yet.
    """
    if not os.path.exists(DB_DIR):
        # FileNotFoundError is still an Exception, so existing broad
        # handlers keep working while callers can now be specific.
        raise FileNotFoundError(
            f"{DB_DIR} does not exist, nothing can be queried"
        )

    vectorstore, _ = _build_vectorstore()

    result = vectorstore.similarity_search_with_score(query=query, k=k)
    # jsonable_encoder turns the search result into plain JSON-compatible
    # structures so that json.dumps can serialise it.
    jsonable_result = jsonable_encoder(result)
    print(json.dumps(jsonable_result, indent=2))
def main():
    """Run the demo end to end: build the index first, then query it."""
    # Order matters: the store must be populated before it is searched.
    for step in (init_chromadb, query_chromadb):
        step()


if __name__ == '__main__':
    main()