# Source: Hugging Face Space "nightfury/ChromaDB" (status: Runtime error).
# File size: 6,244 bytes.

import os
import json
import re
import sys

import gradio as gr
from huggingface_hub import InferenceClient

from langchain_huggingface import HuggingFaceEmbeddings
#from chromadb.utils import embedding_functions
#from langchain_community.embeddings import SentenceTransformerEmbeddings

from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from fastapi.encoders import jsonable_encoder

from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores.faiss import FAISS
from huggingface_hub import snapshot_download

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# Chat model client used by respond() for streamed completions.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

# Select which embeddings we want to use.
# NOTE: this binding is replaced further down by the instruct embeddings,
# so the object created here is not the one the Chroma helpers end up using.
#embeddings = OpenAIEmbeddings()
#embeddings = SentenceTransformerEmbeddings(model_name="nomic-ai/nomic-embed-text-v1", model_kwargs={"trust_remote_code":True})

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


# The Chroma index is persisted in a "db" folder next to this file.
ABS_PATH = os.path.dirname(os.path.abspath(__file__))
DB_DIR = os.path.join(ABS_PATH, "db")

cache_dir = "book_cache"

# Download the files under "book/" from the waterdb/book-embeddings dataset
# repository into `cache_dir`.
vectorstore = snapshot_download(
    repo_id="waterdb/book-embeddings",
    repo_type="dataset",
    revision="main",
    allow_patterns="book/*",  # to download only the one book
    cache_dir=cache_dir,
)

# Get the path to the folder that was just downloaded by looking inside
# `cache_dir` for a directory with the expected name.
# NOTE(review): the original assigned the undefined name `BOOK` here, which
# raised NameError at import time. "book" matches `allow_patterns` above --
# confirm this is the intended folder name.
target_dir = "book"
target_path = None

# Walk through the directory tree recursively
for root, dirs, files in os.walk(cache_dir):
    # Check if the target directory is in the list of directories
    if target_dir in dirs:
        # Get the full path of the target directory
        target_path = os.path.join(root, target_dir)

# Fail with a clear message instead of an unbound-name error further down.
if target_path is None:
    raise FileNotFoundError(
        f"No '{target_dir}' directory found under '{cache_dir}' after download"
    )

# load embeddings
# this is what was used to create embeddings for the book
# NOTE: rebinding `embeddings` here means init_chromadb() and query_chromadb()
# (which read the module-level `embeddings`) use these instruct embeddings.
embeddings = HuggingFaceInstructEmbeddings(
    embed_instruction="Represent the book passage for retrieval: ",
    query_instruction="Represent the question for retrieving supporting texts from the book passage: ",
)

# load vector store to use with langchain
docsearch = FAISS.load_local(folder_path=target_path, embeddings=embeddings)

# similarity search (sample query printed at start-up)
question = "Who is big brother?"
search = docsearch.similarity_search(question, k=4)

for item in search:
    print(item.page_content)
    print(f"From page: {item.metadata['page']}")
    print("---")

# The download path is no longer needed; the name is cleared at module level.
vectorstore = None

def replace_newlines_and_spaces(text):
    """Normalize whitespace in *text* and return the result.

    Every tab immediately followed by a carriage return and every
    non-breaking space (``\\xa0``) is deleted without a replacement; after
    that, newlines become spaces and each run of whitespace is collapsed
    to a single space. Leading/trailing whitespace is collapsed, not stripped.
    """
    # Deletions first: these characters vanish rather than turn into spaces.
    without_markers = text.replace('\t\r', '').replace('\xa0', '')
    # Newlines -> spaces, then squeeze any whitespace run down to one space.
    single_line = without_markers.replace("\n", " ")
    return re.sub(r'\s+', ' ', single_line)


def get_documents():
    """Return the result of loading the water-management PDF with PyPDFLoader."""
    pdf_loader = PyPDFLoader("AI-smart-water-management-systems.pdf")
    return pdf_loader.load()


def init_chromadb():
    """Rebuild the persisted Chroma index in ``DB_DIR`` from the PDF documents.

    Clears ``DB_DIR``, normalizes the whitespace of every document returned
    by ``get_documents()``, splits the documents into chunks
    (``chunk_size=1000``, ``chunk_overlap=0``), embeds them with the
    module-level ``embeddings`` and persists the resulting vector store.
    """
    import shutil

    # Start from an empty index directory. makedirs(exist_ok=True) is used
    # instead of mkdir because rmtree(ignore_errors=True) can silently leave
    # the directory behind, in which case mkdir would raise FileExistsError.
    shutil.rmtree(DB_DIR, ignore_errors=True)
    os.makedirs(DB_DIR, exist_ok=True)

    documents = list(get_documents())
    for doc in documents:
        doc.page_content = replace_newlines_and_spaces(doc.page_content)

    # Split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)

    # Create the vector store to use as the index. A local name is used so the
    # module-level `vectorstore` is not shadowed.
    store = Chroma.from_documents(texts, embeddings, persist_directory=DB_DIR)
    store.persist()
    print("vectorstore::", store)

def query_chromadb(ASK):
    """Run a top-4 similarity search for *ASK* against the persisted index.

    Args:
        ASK: Query text handed to ``similarity_search_with_score``.

    Returns:
        The search result, passed through ``jsonable_encoder`` and serialized
        as an indented JSON string. The same string is also printed.

    Raises:
        FileNotFoundError: If ``DB_DIR`` does not exist.
    """
    if not os.path.exists(DB_DIR):
        raise FileNotFoundError(f"{DB_DIR} does not exist, nothing can be queried")

    # Load Vector store from local disk
    store = Chroma(persist_directory=DB_DIR, embedding_function=embeddings)

    result = store.similarity_search_with_score(query=ASK, k=4)
    payload = json.dumps(jsonable_encoder(result), indent=2)
    print("Json pdf response ::", payload)
    # Return the payload so callers that print or yield the result get the
    # data instead of None.
    return payload


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat completion for *message*, yielding the growing reply.

    Args:
        message: The new user message.
        history: Earlier turns as (user, assistant) pairs; empty entries
            are skipped.
        system_message: Content of the leading system message.
        max_tokens: Forwarded to ``client.chat_completion``.
        temperature: Forwarded to ``client.chat_completion``.
        top_p: Forwarded to ``client.chat_completion``.

    Yields:
        The accumulated assistant reply after each streamed chunk.
    """
    messages = [{"role": "system", "content": system_message}]

    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # The loop variable is `chunk`, not `message`, so the user's message
    # parameter is not overwritten while streaming.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        print("**message :: ", chunk)

        token = chunk.choices[0].delta.content

        print("**token :: ", token)

        # A chunk may carry no text (content is None); appending None to a
        # str would raise TypeError, so treat it as an empty piece.
        response += token or ""

        print("**response :: ", response)

        yield response

    # Debug lookup against the Chroma index. It uses a fixed question, so it
    # runs once after the stream instead of once per streamed token.
    print("**query_chromadb::", query_chromadb("how could an AI be used in smart water management systems?"))


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
# Chat UI wired to respond(). The additional inputs are listed in the same
# order as respond()'s trailing parameters:
# system_message, max_tokens, temperature, top_p.
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(label="System message", value="You are a friendly Chatbot."),
        gr.Slider(label="Max new tokens", minimum=1, maximum=2048, step=1, value=512),
        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7),
        gr.Slider(label="Top-p (nucleus sampling)", minimum=0.1, maximum=1.0, step=0.05, value=0.95),
    ],
)



def main():
    """Rebuild the Chroma index via init_chromadb(), then launch the demo."""
    init_chromadb()
    demo.launch()


# Only run when executed as a script, not when imported.
if __name__ == "__main__":
    main()