File size: 5,043 Bytes
108f56d
06e8209
 
 
 
6c1e24c
06e8209
45e4a54
190af86
6c1e24c
de136b7
06e8209
6c1e24c
 
 
06e8209
98e5fa6
 
06e8209
 
 
 
98e5fa6
35a54f7
98e5fa6
06e8209
 
 
 
 
 
 
 
 
 
ac3eb9e
06e8209
 
 
6c1e24c
 
 
 
06e8209
 
 
 
 
 
 
6c1e24c
 
 
 
 
35a54f7
ffa4e8a
6c1e24c
 
 
87b12fd
35a54f7
4bfd6da
4399bb2
 
4bfd6da
 
 
4399bb2
 
4bfd6da
4399bb2
4bfd6da
4399bb2
4ffac05
06e8209
 
 
 
 
6c1e24c
68c6a03
98e5fa6
ffa4e8a
6c1e24c
98e5fa6
87b12fd
06e8209
7f3a89f
 
06e8209
 
 
 
df9d7da
06e8209
 
108f56d
 
 
 
426a78e
 
108f56d
 
 
426a78e
108f56d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cad60c2
 
 
 
108f56d
cad60c2
108f56d
 
 
 
 
 
 
 
 
 
 
 
 
cad60c2
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""
import json
import logging
import os
import re
import sys

from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import CharacterTextSplitter
#from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from fastapi.encoders import jsonable_encoder
from dotenv import load_dotenv

#load_dotenv()
#logging.basicConfig(level=logging.DEBUG)

ABS_PATH = os.path.dirname(os.path.abspath(__file__))
DB_DIR = os.path.join(ABS_PATH, "db")

vectorstore = None
#embedding_function
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

def replace_newlines_and_spaces(text):
    # Replace all newline characters with spaces
    text = text.replace("\n", " ")
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    return text


def get_documents():
    return PyPDFLoader("AI-smart-water-management-systems.pdf").load()


def init_chromadb():
    # Delete existing index directory and recreate the directory
    if os.path.exists(DB_DIR):
        import shutil
        shutil.rmtree(DB_DIR, ignore_errors=True)
        os.mkdir(DB_DIR)

    documents = []
    for num, doc in enumerate(get_documents()):
        doc.page_content = replace_newlines_and_spaces(doc.page_content)
        documents.append(doc)

    # Split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)

    # Select which embeddings we want to use
    #embeddings = OpenAIEmbeddings()
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Create the vectorestore to use as the index
    vectorstore = Chroma.from_documents(texts, embeddings, persist_directory=DB_DIR)
    #vectorstore.persist()
    
    #print(vectorstore)
    #vectorstore = None

    #db = vectorstore
    #db.get()
    #print(len(db.get()["ids"]))

    # Print the list of source files
    for x in range(len(vectorstore.get()["ids"])):
        # print(db.get()["metadatas"][x])
        doc = vectorstore.get()["metadatas"][x]
        source = doc["source"]
        print("Source {x} :: ",source)

def query_chromadb():
    if not os.path.exists(DB_DIR):
        raise Exception(f"{DB_DIR} does not exist, nothing can be queried")

    # Select which embeddings we want to use
    #embeddings = OpenAIEmbeddings()
    #embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    # Load Vector store from local disk
    #vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)
    #vectorstore.persist()

    result = vectorstore.similarity_search_with_score(query="how to use AI in water conservation?", k=4)
    
    jsonable_result = jsonable_encoder(result)
    print(json.dumps(jsonable_result, indent=2))

def main():
    init_chromadb()

if __name__ == '__main__':
    main()
"""
import chromadb
from llama_index.core import (
    Document,
    PromptTemplate,
    ServiceContext,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
#from llama_index.core import VectorStoreIndex, StorageContext, TextNode
from llama_index.core.indices.vector_store.retrievers import VectorIndexAutoRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.vector_stores import VectorStoreInfo
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
#from llama_index.llms.huggingface import HuggingFaceLLM

# Initialize ChromaDB client and collection.
# NOTE(review): requires a Chroma server already running on localhost:8080.
chroma_client = chromadb.HttpClient(host="localhost", port=8080, ssl=False)
chroma_collection = chroma_client.get_or_create_collection("example_collection")

# Embedding model used to encode both documents and queries.
embed_model = HuggingFaceEmbedding(model_name="all-MiniLM-L6-v2")

# Wrap the Chroma collection as a llama_index vector store and bind it
# into a StorageContext so the index persists embeddings into Chroma.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Documents to index. llama_index requires Document objects, not raw dicts.
# (The original list mixed a {"text": ..., "embedding": ...} dict with calls to
# DirectoryLoader/PyPDFLoader — langchain names that were never imported here
# and would have raised NameError.)
documents = [
    Document(text="Your document text here"),
    # Add more documents as needed, e.g.:
    #   documents.extend(SimpleDirectoryReader("/content/cricket").load_data())
]

# Build the index from the documents. VectorStoreIndex.from_documents is a
# constructor-style classmethod: it embeds the documents with embed_model and
# writes them into the vector store. (The original constructed an empty index
# and then called .from_documents() on the instance, so nothing was indexed.)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    embed_model=embed_model,
)

# Metadata schema the auto-retriever uses to infer query filters; it is a
# required companion to VectorIndexAutoRetriever (the original omitted it).
vector_store_info = VectorStoreInfo(
    content_info="Example documents stored in the Chroma collection",
    metadata_info=[],
)

# Auto-retriever over the index.
# NOTE(review): auto-retrieval uses an LLM to parse the query into filters;
# a default LLM (e.g. OpenAI credentials) must be configured — confirm.
auto_retriever = VectorIndexAutoRetriever(index, vector_store_info=vector_store_info)

# Query engine that synthesizes an answer from the retrieved nodes.
query_engine = RetrieverQueryEngine.from_args(auto_retriever)

# Run a query and print the synthesized response.
response = query_engine.query("Your query here")
print(response)