File size: 2,829 Bytes
06e8209
 
 
 
6c1e24c
06e8209
45e4a54
190af86
6c1e24c
de136b7
06e8209
6c1e24c
 
 
06e8209
 
6c1e24c
06e8209
 
 
 
35a54f7
 
06e8209
 
 
 
 
 
 
 
 
 
ac3eb9e
06e8209
 
 
6c1e24c
 
 
 
06e8209
 
 
 
 
 
 
6c1e24c
 
 
 
 
35a54f7
6c1e24c
 
 
06e8209
35a54f7
06e8209
4399bb2
 
 
35a54f7
4399bb2
 
 
 
 
 
 
 
06e8209
 
 
 
 
6c1e24c
06e8209
6c1e24c
 
7f3a89f
06e8209
7f3a89f
 
06e8209
 
 
 
df9d7da
06e8209
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import json
import logging
import os
import re
import sys

from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import CharacterTextSplitter
#from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from fastapi.encoders import jsonable_encoder
from dotenv import load_dotenv

# Module-level setup: runs once at import time, before any function below.
# load_dotenv() comes from python-dotenv; presumably it reads a local .env
# file into the process environment -- confirm which variables are expected.
load_dotenv()
# Root logger at DEBUG so third-party library diagnostics are visible.
logging.basicConfig(level=logging.DEBUG)

# Directory containing this file; the vector store lives in its "db" subfolder.
ABS_PATH = os.path.dirname(os.path.abspath(__file__))
DB_DIR = os.path.join(ABS_PATH, "db")

# Embedding model handed to Chroma when init_chromadb() builds the index.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
#embedding_function

def replace_newlines_and_spaces(text):
    """Return *text* with newlines turned into spaces and whitespace runs collapsed.

    Every newline becomes a space first; afterwards each run of one or more
    whitespace characters is squeezed down to a single space. Leading and
    trailing whitespace is collapsed but not stripped.
    """
    without_newlines = " ".join(text.split("\n"))
    return re.sub(r'\s+', ' ', without_newlines)


def get_documents():
    """Load the source PDF with ``PyPDFLoader`` and return what ``load()`` yields.

    The PDF path is relative, so it is resolved against the current working
    directory of the process.
    """
    pdf_loader = PyPDFLoader("AI-smart-water-management-systems.pdf")
    return pdf_loader.load()


def init_chromadb():
    """Rebuild the Chroma index under ``DB_DIR`` from the source PDF.

    Steps:
      1. Wipe any existing index directory and recreate it empty.
      2. Load the PDF via ``get_documents()`` and normalise the whitespace of
         each document's ``page_content``.
      3. Split the documents into chunks and embed them into a persisted
         Chroma vector store using the module-level ``embeddings``.
      4. Print the store, the number of stored entries, and the ``source``
         metadata of each entry as a sanity check.

    Returns:
        None. Results are written to disk and printed to stdout.
    """
    import shutil

    # Start from a clean directory. makedirs(exist_ok=True) avoids the
    # FileExistsError that os.mkdir would raise if rmtree (which ignores
    # errors) could not fully remove the old directory.
    shutil.rmtree(DB_DIR, ignore_errors=True)
    os.makedirs(DB_DIR, exist_ok=True)

    documents = list(get_documents())
    for doc in documents:
        doc.page_content = replace_newlines_and_spaces(doc.page_content)

    # Split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)

    # Create the vectorstore to use as the index
    vectorstore = Chroma.from_documents(texts, embeddings, persist_directory=DB_DIR)
    vectorstore.persist()

    print(vectorstore)

    # Fetch the stored collection once; calling get() per loop iteration would
    # re-read the whole collection every time.
    stored = vectorstore.get()
    print(len(stored["ids"]))

    # Print the list of source files
    for metadata in stored["metadatas"]:
        print(metadata["source"])

def query_chromadb(query="how to use AI in water conservation?", k=4):
    """Run a similarity search against the persisted Chroma index and print it.

    The store is opened with the module-level ``embeddings`` so that query
    vectors come from the same model that built the index; a different
    embedding model would produce vectors that do not match the stored ones.

    Args:
        query: Natural-language question to search for.
        k: Number of nearest chunks to return.

    Raises:
        FileNotFoundError: If ``DB_DIR`` does not exist (no index was built).

    Returns:
        None. The scored results are printed to stdout as indented JSON.
    """
    if not os.path.exists(DB_DIR):
        raise FileNotFoundError(f"{DB_DIR} does not exist, nothing can be queried")

    # Load Vector store from local disk
    vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)

    result = vectorstore.similarity_search_with_score(query=query, k=k)

    # Results contain non-JSON-native objects; jsonable_encoder converts them
    # into plain structures that json.dumps accepts.
    jsonable_result = jsonable_encoder(result)
    print(json.dumps(jsonable_result, indent=2))

def main():
    """Rebuild the vector index, then run the sample query against it."""
    for step in (init_chromadb, query_chromadb):
        step()

# Run the index-and-query demo only when executed as a script, not on import.
if __name__ == '__main__':
    main()