Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import logging
|
3 |
+
import os
|
4 |
+
import re
|
5 |
+
|
6 |
+
import chromadb
|
7 |
+
from dotenv import load_dotenv
|
8 |
+
from fastapi.encoders import jsonable_encoder
|
9 |
+
from langchain.document_loaders import PyPDFLoader
|
10 |
+
from langchain.embeddings import OpenAIEmbeddings
|
11 |
+
from langchain.vectorstores import Chroma
|
12 |
+
|
13 |
+
# Load variables from a .env file into the environment before anything
# reads them (presumably the OpenAI API key used by OpenAIEmbeddings --
# confirm against the deployment setup).
load_dotenv()

# Configure the root logger to emit DEBUG-level records and above.
logging.basicConfig(level=logging.DEBUG)

# Anchor the database directory to this file's location rather than the
# current working directory.
ABS_PATH = os.path.dirname(os.path.abspath(__file__))
DB_DIR = os.path.join(ABS_PATH, "db")
|
18 |
+
|
19 |
+
|
20 |
+
def replace_newlines_and_spaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Newlines, tabs and spaces are all matched by ``\\s``, so one regex
    substitution handles both "newline to space" and "squeeze repeated
    spaces". Leading or trailing whitespace is reduced to one space; it
    is not stripped.

    Args:
        text: The string to normalize.

    Returns:
        A copy of ``text`` with each whitespace run replaced by ``' '``.
    """
    # A separate text.replace("\n", " ") pass is unnecessary: "\n" is
    # already covered by \s+, so the result is the same with one scan.
    return re.sub(r'\s+', ' ', text)
|
28 |
+
|
29 |
+
|
30 |
+
def get_documents():
    """Load the ``MorseVsFrederick.pdf`` fixture and return its documents.

    The path is relative, so it is resolved against the current working
    directory at call time.

    Returns:
        Whatever ``PyPDFLoader.load()`` yields for the fixture PDF.
    """
    pdf_loader = PyPDFLoader("fixtures/pdf/MorseVsFrederick.pdf")
    return pdf_loader.load()
|
32 |
+
|
33 |
+
|
34 |
+
def init_chromadb():
    """Build and persist the Chroma vector store under ``DB_DIR``.

    Creates ``DB_DIR`` if it is missing, opens the ``langchain_store``
    collection backed by OpenAI embeddings, normalizes the whitespace of
    every document returned by ``get_documents()``, adds those documents
    to the store, persists it, and prints the vector store object.
    """
    # exist_ok=True replaces the exists()-then-mkdir() pair, which could
    # fail if the directory appeared between the check and the create.
    os.makedirs(DB_DIR, exist_ok=True)

    client_settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False,
    )
    embeddings = OpenAIEmbeddings()

    vectorstore = Chroma(
        collection_name="langchain_store",
        embedding_function=embeddings,
        client_settings=client_settings,
        persist_directory=DB_DIR,
    )

    # Clean each page in place; the loop index was never used, so plain
    # iteration is enough.
    documents = list(get_documents())
    for doc in documents:
        doc.page_content = replace_newlines_and_spaces(doc.page_content)

    # The extra ``embedding`` keyword is forwarded unchanged, exactly as
    # before, so the call the library receives is the same.
    vectorstore.add_documents(documents=documents, embedding=embeddings)
    vectorstore.persist()
    print(vectorstore)
|
59 |
+
|
60 |
+
|
61 |
+
def query_chromadb(query="who is FREDERICK?", k=4):
    """Run a similarity search against the persisted store and print it.

    Opens the ``langchain_store`` collection under ``DB_DIR`` with OpenAI
    embeddings, performs ``similarity_search_with_score``, converts the
    result with ``jsonable_encoder`` and prints it as indented JSON.

    Args:
        query: Text to search for. Defaults to the question that was
            previously hard-coded, so existing no-argument calls behave
            the same.
        k: Number of results to request. Defaults to 4, as before.

    Raises:
        FileNotFoundError: If ``DB_DIR`` does not exist. This is a
            subclass of ``Exception``, so callers that caught the
            previous generic ``Exception`` still catch it.
    """
    if not os.path.exists(DB_DIR):
        raise FileNotFoundError(
            f"{DB_DIR} does not exist, nothing can be queried"
        )

    client_settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False,
    )

    embeddings = OpenAIEmbeddings()

    vectorstore = Chroma(
        collection_name="langchain_store",
        embedding_function=embeddings,
        client_settings=client_settings,
        persist_directory=DB_DIR,
    )

    result = vectorstore.similarity_search_with_score(query=query, k=k)
    # The result holds library objects; jsonable_encoder turns them into
    # plain structures that json.dumps can serialize.
    jsonable_result = jsonable_encoder(result)
    print(json.dumps(jsonable_result, indent=2))
|
82 |
+
|
83 |
+
|
84 |
+
def main():
    """Script entry point: query the already-persisted Chroma store."""
    # init_chromadb() is intentionally left disabled here; call it
    # instead when the store under DB_DIR has not been built yet.
    query_chromadb()
|
87 |
+
|
88 |
+
|
89 |
+
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|