nightfury committed on
Commit
06e8209
·
verified ·
1 Parent(s): 9b6f465

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -0
app.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import re
5
+
6
+ import chromadb
7
+ from dotenv import load_dotenv
8
+ from fastapi.encoders import jsonable_encoder
9
+ from langchain.document_loaders import PyPDFLoader
10
+ from langchain.embeddings import OpenAIEmbeddings
11
+ from langchain.vectorstores import Chroma
12
+
13
# Load variables from a local .env file into the process environment before
# any client object is constructed.
# NOTE(review): presumably this supplies the OpenAI credentials that
# OpenAIEmbeddings reads - confirm against the deployment's .env.
load_dotenv()

# Configures the root logger, so DEBUG output from imported libraries is
# shown as well.
logging.basicConfig(level=logging.DEBUG)

# The persisted vector store lives in a "db" folder next to this file,
# independent of the current working directory.
ABS_PATH = os.path.dirname(os.path.abspath(__file__))
DB_DIR = os.path.join(ABS_PATH, "db")
18
+
19
+
20
def replace_newlines_and_spaces(text):
    """Flatten *text* onto one line with single-space separation.

    Newlines are turned into spaces, then every run of whitespace is
    squeezed down to exactly one space. Leading and trailing whitespace is
    collapsed to a single space, not stripped.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    single_line = text.replace("\n", " ")
    return re.sub(r'\s+', ' ', single_line)
28
+
29
+
30
def get_documents():
    """Load the Morse v. Frederick fixture PDF through ``PyPDFLoader``.

    The path is relative, so it is resolved against the current working
    directory rather than this file's location.

    Returns:
        Whatever ``PyPDFLoader.load()`` produces for the file (presumably
        one document per page - confirm against the loader's docs).
    """
    pdf_loader = PyPDFLoader("fixtures/pdf/MorseVsFrederick.pdf")
    return pdf_loader.load()
32
+
33
+
34
def init_chromadb():
    """Build the Chroma collection from the fixture PDF and persist it.

    Ensures ``DB_DIR`` exists, normalises the whitespace of every document
    returned by ``get_documents()``, adds the documents to the
    "langchain_store" collection using OpenAI embeddings, persists the
    store under ``DB_DIR`` and prints the vector store object.
    """
    # exist_ok=True replaces the racy "check exists, then mkdir" sequence.
    os.makedirs(DB_DIR, exist_ok=True)

    client_settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False,
    )
    embeddings = OpenAIEmbeddings()

    vectorstore = Chroma(
        collection_name="langchain_store",
        embedding_function=embeddings,
        client_settings=client_settings,
        persist_directory=DB_DIR,
    )

    # Documents are cleaned in place; the list holds the same objects the
    # loader returned.
    documents = list(get_documents())
    for doc in documents:
        doc.page_content = replace_newlines_and_spaces(doc.page_content)

    vectorstore.add_documents(documents=documents, embedding=embeddings)
    vectorstore.persist()
    print(vectorstore)
59
+
60
+
61
def query_chromadb(query="who is FREDERICK?", k=4):
    """Run a similarity search against the persisted Chroma collection.

    Opens the "langchain_store" collection stored under ``DB_DIR``, runs
    ``similarity_search_with_score`` and prints the JSON-encoded result to
    stdout.

    Args:
        query: Text to search for. Defaults to the original demo question.
        k: Number of results requested from the vector store.

    Raises:
        FileNotFoundError: If ``DB_DIR`` does not exist, i.e. the store has
            not been built yet. This is an ``Exception`` subclass, so
            callers catching ``Exception`` are unaffected.
    """
    if not os.path.exists(DB_DIR):
        raise FileNotFoundError(f"{DB_DIR} does not exist, nothing can be queried")

    client_settings = chromadb.config.Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=DB_DIR,
        anonymized_telemetry=False,
    )
    embeddings = OpenAIEmbeddings()

    vectorstore = Chroma(
        collection_name="langchain_store",
        embedding_function=embeddings,
        client_settings=client_settings,
        persist_directory=DB_DIR,
    )

    result = vectorstore.similarity_search_with_score(query=query, k=k)
    # jsonable_encoder converts the search result into plain JSON-compatible
    # data before it is dumped.
    jsonable_result = jsonable_encoder(result)
    print(json.dumps(jsonable_result, indent=2))
82
+
83
+
84
def main():
    """Script entry point: query the already-persisted Chroma store.

    Only the query step runs here. The store must have been built
    beforehand by calling ``init_chromadb()`` once; otherwise
    ``query_chromadb()`` raises because the database directory is missing.
    """
    query_chromadb()
87
+
88
+
89
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()