nightfury committed on
Commit
6c1e24c
·
verified ·
1 Parent(s): 4ea4dfc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -38
app.py CHANGED
@@ -2,17 +2,17 @@ import json
2
  import logging
3
  import os
4
  import re
 
5
 
6
- import chromadb
7
- #from pydantic.v1 import BaseSettings
8
- from dotenv import load_dotenv
9
- from fastapi.encoders import jsonable_encoder
10
- from langchain.document_loaders import PyPDFLoader
11
  from langchain.embeddings import OpenAIEmbeddings
12
  from langchain.vectorstores import Chroma
 
 
 
13
 
14
  load_dotenv()
15
- #logging.basicConfig(level=logging.DEBUG)
16
 
17
  ABS_PATH = os.path.dirname(os.path.abspath(__file__))
18
  DB_DIR = os.path.join(ABS_PATH, "db")
@@ -21,10 +21,8 @@ DB_DIR = os.path.join(ABS_PATH, "db")
21
  def replace_newlines_and_spaces(text):
22
  # Replace all newline characters with spaces
23
  text = text.replace("\n", " ")
24
-
25
  # Replace multiple spaces with a single space
26
  text = re.sub(r'\s+', ' ', text)
27
-
28
  return text
29
 
30
 
@@ -33,59 +31,46 @@ def get_documents():
33
 
34
 
35
  def init_chromadb():
36
- if not os.path.exists(DB_DIR):
 
 
 
37
  os.mkdir(DB_DIR)
38
 
39
- client_settings = chromadb.config.Settings(
40
- chroma_db_impl="duckdb+parquet",
41
- persist_directory=DB_DIR,
42
- anonymized_telemetry=False
43
- )
44
- embeddings = OpenAIEmbeddings()
45
-
46
- vectorstore = Chroma(
47
- collection_name="langchain_store",
48
- embedding_function=embeddings,
49
- client_settings=client_settings,
50
- persist_directory=DB_DIR,
51
- )
52
  documents = []
53
  for num, doc in enumerate(get_documents()):
54
  doc.page_content = replace_newlines_and_spaces(doc.page_content)
55
  documents.append(doc)
56
 
57
- vectorstore.add_documents(documents=documents, embedding=embeddings)
 
 
 
 
 
 
 
 
58
  vectorstore.persist()
59
  print(vectorstore)
60
-
61
 
62
  def query_chromadb():
63
  if not os.path.exists(DB_DIR):
64
  raise Exception(f"{DB_DIR} does not exist, nothing can be queried")
65
 
66
- client_settings = chromadb.config.Settings(
67
- chroma_db_impl="duckdb+parquet",
68
- persist_directory=DB_DIR,
69
- anonymized_telemetry=False
70
- )
71
-
72
  embeddings = OpenAIEmbeddings()
 
 
73
 
74
- vectorstore = Chroma(
75
- collection_name="langchain_store",
76
- embedding_function=embeddings,
77
- client_settings=client_settings,
78
- persist_directory=DB_DIR,
79
- )
80
  result = vectorstore.similarity_search_with_score(query="who is FREDERICK?", k=4)
81
  jsonable_result = jsonable_encoder(result)
82
  print(json.dumps(jsonable_result, indent=2))
83
 
84
-
85
  def main():
86
  init_chromadb()
87
  query_chromadb()
88
 
89
-
90
  if __name__ == '__main__':
91
  main()
 
2
  import logging
3
  import os
4
  import re
5
+ import sys
6
 
7
+ from langchain.text_splitter import CharacterTextSplitter
 
 
 
 
8
  from langchain.embeddings import OpenAIEmbeddings
9
  from langchain.vectorstores import Chroma
10
+ from langchain.document_loaders import PyPDFLoader
11
+ from fastapi.encoders import jsonable_encoder
12
+ from dotenv import load_dotenv
13
 
14
  load_dotenv()
15
+ logging.basicConfig(level=logging.DEBUG)
16
 
17
  ABS_PATH = os.path.dirname(os.path.abspath(__file__))
18
  DB_DIR = os.path.join(ABS_PATH, "db")
 
21
  def replace_newlines_and_spaces(text):
22
  # Replace all newline characters with spaces
23
  text = text.replace("\n", " ")
 
24
  # Replace multiple spaces with a single space
25
  text = re.sub(r'\s+', ' ', text)
 
26
  return text
27
 
28
 
 
31
 
32
 
33
  def init_chromadb():
34
+ # Delete existing index directory and recreate the directory
35
+ if os.path.exists(DB_DIR):
36
+ import shutil
37
+ shutil.rmtree(DB_DIR, ignore_errors=True)
38
  os.mkdir(DB_DIR)
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  documents = []
41
  for num, doc in enumerate(get_documents()):
42
  doc.page_content = replace_newlines_and_spaces(doc.page_content)
43
  documents.append(doc)
44
 
45
+ # Split the documents into chunks
46
+ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
47
+ texts = text_splitter.split_documents(documents)
48
+
49
+ # Select which embeddings we want to use
50
+ embeddings = OpenAIEmbeddings()
51
+
52
+ # Create the vectorstore to use as the index
53
+ vectorstore = Chroma.from_documents(texts, embeddings, persist_directory=DB_DIR)
54
  vectorstore.persist()
55
  print(vectorstore)
56
+ vectorstore = None
57
 
58
  def query_chromadb():
59
  if not os.path.exists(DB_DIR):
60
  raise Exception(f"{DB_DIR} does not exist, nothing can be queried")
61
 
62
+ # Select which embeddings we want to use
 
 
 
 
 
63
  embeddings = OpenAIEmbeddings()
64
+ # Load Vector store from local disk
65
+ vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)
66
 
 
 
 
 
 
 
67
  result = vectorstore.similarity_search_with_score(query="who is FREDERICK?", k=4)
68
  jsonable_result = jsonable_encoder(result)
69
  print(json.dumps(jsonable_result, indent=2))
70
 
 
71
  def main():
72
  init_chromadb()
73
  query_chromadb()
74
 
 
75
  if __name__ == '__main__':
76
  main()