"""
import json
import logging
import os
import re
import sys
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import CharacterTextSplitter
#from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from fastapi.encoders import jsonable_encoder
from dotenv import load_dotenv
#load_dotenv()
#logging.basicConfig(level=logging.DEBUG)
ABS_PATH = os.path.dirname(os.path.abspath(__file__))
DB_DIR = os.path.join(ABS_PATH, "db")
vectorstore = None
#embedding_function
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
def replace_newlines_and_spaces(text):
# Replace all newline characters with spaces
text = text.replace("\n", " ")
# Replace multiple spaces with a single space
text = re.sub(r'\s+', ' ', text)
return text
def get_documents():
return PyPDFLoader("AI-smart-water-management-systems.pdf").load()
def init_chromadb():
# Delete existing index directory and recreate the directory
if os.path.exists(DB_DIR):
import shutil
shutil.rmtree(DB_DIR, ignore_errors=True)
os.mkdir(DB_DIR)
documents = []
for num, doc in enumerate(get_documents()):
doc.page_content = replace_newlines_and_spaces(doc.page_content)
documents.append(doc)
# Split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
# Select which embeddings we want to use
#embeddings = OpenAIEmbeddings()
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Create the vectorestore to use as the index
vectorstore = Chroma.from_documents(texts, embeddings, persist_directory=DB_DIR)
#vectorstore.persist()
#print(vectorstore)
#vectorstore = None
#db = vectorstore
#db.get()
#print(len(db.get()["ids"]))
# Print the list of source files
for x in range(len(vectorstore.get()["ids"])):
# print(db.get()["metadatas"][x])
doc = vectorstore.get()["metadatas"][x]
source = doc["source"]
print("Source {x} :: ",source)
def query_chromadb():
if not os.path.exists(DB_DIR):
raise Exception(f"{DB_DIR} does not exist, nothing can be queried")
# Select which embeddings we want to use
#embeddings = OpenAIEmbeddings()
#embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# Load Vector store from local disk
#vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)
#vectorstore.persist()
result = vectorstore.similarity_search_with_score(query="how to use AI in water conservation?", k=4)
jsonable_result = jsonable_encoder(result)
print(json.dumps(jsonable_result, indent=2))
def main():
init_chromadb()
if __name__ == '__main__':
main()
"""
import chromadb

from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.indices.vector_store.retrievers import VectorIndexAutoRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.vector_stores import MetadataInfo, VectorStoreInfo
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
# from llama_index.llms.huggingface import HuggingFaceLLM
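# Assumed PyPI packages for these imports: llama-index-core, chromadb,
# llama-index-vector-stores-chroma, llama-index-embeddings-huggingface.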
# Initialize the ChromaDB client and collection (assumes a Chroma server is
# already listening on localhost:8080; the server's default port is 8000).
chroma_client = chromadb.HttpClient(host="localhost", port=8080, ssl=False)
chroma_collection = chroma_client.get_or_create_collection("example_collection")

# Define the embedding function using a HuggingFace sentence-transformers model.
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Wrap the collection in a ChromaVectorStore and set up the StorageContext so
# the index built below writes its embeddings into Chroma.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Define and load documents with embeddings
documents = [
{
"text": "Your document text here", "embedding": [0.1, 0.2, 0.3],
'cricket': DirectoryLoader('/content/cricket', glob="*.pdf", loader_cls=PyPDFLoader).load(),
'fifa': DirectoryLoader('/content/fifa', glob="*.pdf", loader_cls=PyPDFLoader).load(),
# Add more documents as needed
},
]
# Load documents into ChromaDB using VectorStoreIndex
index.from_documents(documents=documents)
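# A minimal sketch (assuming the collection was populated by an earlier run):
# because the embeddings live in the Chroma server, the index can be rebuilt
# without re-reading the PDFs.
# index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)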
# Initialize the auto-retriever. VectorIndexAutoRetriever requires a
# VectorStoreInfo describing the collection so its LLM can infer metadata
# filters (the descriptions below are illustrative); it uses the globally
# configured LLM (Settings.llm), which defaults to OpenAI unless overridden,
# e.g. with the commented-out HuggingFaceLLM import above.
vector_store_info = VectorStoreInfo(
    content_info="PDF documents about cricket and FIFA",
    metadata_info=[
        MetadataInfo(
            name="file_name",
            type="str",
            description="Name of the source PDF file",
        ),
    ],
)
auto_retriever = VectorIndexAutoRetriever(index, vector_store_info=vector_store_info)

# Set up the RetrieverQueryEngine with the auto-retriever.
query_engine = RetrieverQueryEngine.from_args(auto_retriever)

# Query documents using the RetrieverQueryEngine.
response = query_engine.query("Your query here")
print(response)
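
# A simpler alternative (minimal sketch): when no LLM-inferred metadata
# filtering is needed, the index's default retriever can serve queries directly.
# simple_engine = index.as_query_engine(similarity_top_k=4)
# print(simple_engine.query("Your query here"))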