File size: 2,829 Bytes
06e8209 6c1e24c 06e8209 45e4a54 190af86 6c1e24c de136b7 06e8209 6c1e24c 06e8209 6c1e24c 06e8209 35a54f7 06e8209 ac3eb9e 06e8209 6c1e24c 06e8209 6c1e24c 35a54f7 6c1e24c 06e8209 35a54f7 06e8209 4399bb2 35a54f7 4399bb2 06e8209 6c1e24c 06e8209 6c1e24c 7f3a89f 06e8209 7f3a89f 06e8209 df9d7da 06e8209 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import json
import logging
import os
import re
import sys
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import CharacterTextSplitter
#from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from fastapi.encoders import jsonable_encoder
from dotenv import load_dotenv
# Pull environment variables (API keys, etc.) from a local .env file.
load_dotenv()
# Verbose logging for the whole run.
logging.basicConfig(level=logging.DEBUG)
# Persist the Chroma index next to this script, under ./db.
ABS_PATH = os.path.dirname(os.path.abspath(__file__))
DB_DIR = os.path.join(ABS_PATH, "db")
# Shared embedding model, used for both building and querying the index.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
def replace_newlines_and_spaces(text):
    """Collapse newlines and runs of whitespace in *text* into single spaces."""
    # Newlines become spaces first; any remaining whitespace run is then
    # squeezed down to a single space.
    return re.sub(r'\s+', ' ', text.replace("\n", " "))
def get_documents():
    """Load the source PDF and return its pages as LangChain documents."""
    loader = PyPDFLoader("AI-smart-water-management-systems.pdf")
    return loader.load()
def init_chromadb():
    """(Re)build the Chroma vector store from the source PDF.

    Deletes any existing index under DB_DIR, normalizes and chunks the
    PDF pages, embeds them with the module-level embeddings, persists
    the resulting index, and prints a short summary of what was stored.
    """
    # Start from a clean slate: drop any previous index directory.
    if os.path.exists(DB_DIR):
        import shutil
        shutil.rmtree(DB_DIR, ignore_errors=True)
    # makedirs(..., exist_ok=True) is safe both when DB_DIR never existed
    # (plain mkdir as pasted only ran together with the delete branch) and
    # when rmtree(ignore_errors=True) silently left the directory behind.
    os.makedirs(DB_DIR, exist_ok=True)

    # Normalize whitespace in every page before chunking.
    documents = []
    for doc in get_documents():
        doc.page_content = replace_newlines_and_spaces(doc.page_content)
        documents.append(doc)

    # Split the documents into ~1000-character chunks (no overlap).
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)

    # Embed the chunks and persist the vector store to disk.
    vectorstore = Chroma.from_documents(texts, embeddings, persist_directory=DB_DIR)
    vectorstore.persist()
    print(vectorstore)

    # Fetch the stored records ONCE (the original re-ran the full get()
    # on every loop iteration) and report what was indexed.
    records = vectorstore.get()
    print(len(records["ids"]))
    # Print the source file recorded for each stored chunk.
    for metadata in records["metadatas"]:
        print(metadata["source"])
def query_chromadb():
    """Load the persisted Chroma index and run a demo similarity search.

    Raises:
        Exception: if DB_DIR does not exist (index was never built).
    """
    if not os.path.exists(DB_DIR):
        raise Exception(f"{DB_DIR} does not exist, nothing can be queried")

    # Reuse the module-level SentenceTransformer embeddings. The original
    # called OpenAIEmbeddings(), whose import is commented out at the top
    # of the file (NameError at runtime) — and queries must be embedded
    # with the same model the index was built with in any case.
    vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)

    # NOTE: the original called db2.persist() here; db2 was never defined
    # (another NameError), and persisting is unnecessary for a read-only
    # query, so the call is dropped entirely.
    result = vectorstore.similarity_search_with_score(
        query="how to use AI in water conservation?", k=4
    )
    jsonable_result = jsonable_encoder(result)
    print(json.dumps(jsonable_result, indent=2))
def main():
    """Rebuild the vector store, then run the demo query against it."""
    init_chromadb()
    query_chromadb()


if __name__ == '__main__':
    main()