HarshaBattula committed on
Commit
a9e9e50
·
1 Parent(s): b91232a

adding gpt-3.5 based retrieval augmented system

Browse files
app.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import openai
2
+ from langchain.vectorstores import Chroma
3
+ from langchain.embeddings import OpenAIEmbeddings
4
+ from retriever import *
5
+ from chain import *
6
+ import gradio as gr
7
+
8
def chatbot(query):
    """Answer a user query with the module-level retrieval QA chain.

    Args:
        query (str): The question text entered by the user.

    Returns:
        Whatever ``qa_chain.run`` returns for this query.
    """
    # The chain receives its input under the "query" key.
    chain_inputs = {"query": query}
    return qa_chain.run(chain_inputs)
11
+
12
+
13
def load_embeddings_database_from_disk(persistence_directory, embeddings_generator):
    """Open the Chroma vector database persisted in a directory.

    Args:
        persistence_directory (str): Directory where the database is stored on disk.
        embeddings_generator (obj): Embedding function handed to Chroma. It should be
            the same one that was used when the database was created.

    Returns:
        The Chroma vector database object built from these two settings.
    """
    chroma_options = {
        "persist_directory": persistence_directory,
        "embedding_function": embeddings_generator,
    }
    return Chroma(**chroma_options)
33
+
34
+
35
+
36
# Directory that holds the already-persisted Chroma database.
persistence_directory = 'db'
# Embedding function handed to Chroma when the database is loaded; nothing is
# created or persisted by this script.
embeddings_generator = OpenAIEmbeddings(openai_api_key=openai.api_key)
# Load the Chroma vector database from disk.
vector_database = load_embeddings_database_from_disk(persistence_directory, embeddings_generator)
# Number of documents the retriever is configured to return per query.
topk_documents = 2
# Build the retriever over the vector database, then the QA chain on top of it.
retriever = initialize_document_retriever(topk_documents, vector_database)
qa_chain = create_question_answering_chain(retriever)


# Use the top-level gr.Textbox component: the gr.inputs / gr.outputs namespaces
# are deprecated and absent from newer Gradio releases, and requirements.txt
# does not pin a Gradio version.
inputs = gr.Textbox(lines=7, label="Conversational Interface with Chat history")
outputs = gr.Textbox(label="Reply")

# NOTE(review): `show_progress` and `theme="compact"` are kept as in the original;
# confirm they are still accepted by the Gradio version actually installed.
gr.Interface(fn=chatbot, inputs=inputs, outputs=outputs, title="Retrieval Augmented Question Answering",
             show_progress=True, theme="compact").launch(share=True, debug=True)
chain.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import openai
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
6
+
7
# SECURITY: never commit API keys. The key that used to be hard-coded here has
# been published in version control and must be revoked and rotated.
# The key is now taken from the OPENAI_API_KEY environment variable (None when
# the variable is unset, in which case the OpenAI clients will report the error).
openai.api_key = os.environ.get("OPENAI_API_KEY")
8
+
9
def create_juniper_prompt_template():
    """Build the prompt template used by the question answering chain.

    The template text has three placeholders: ``{context}`` (wrapped in
    <ctx></ctx>), ``{history}`` (wrapped in <hs></hs>) and ``{question}``.

    Returns:
        A ``PromptTemplate`` declared with the input variables
        "history", "context" and "question".
    """
    # NOTE(review): the template text is reproduced exactly as visible in the
    # source view; confirm the whitespace inside the string against the real file.
    prompt_text = """You are a network engineer from Juniper Networks not a Language Model, use your knowledge, and the some pieces of context (delimited by <ctx></ctx>) to answer the user's question. \n Try to pretend as if you are a member of Juniper Networks. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.
Do not indicate that you have access to any context.
Use the chat history (delimited by <hs></hs>) to keep track of the conversation.
\n----------------\n
<ctx>
{context}
</ctx>
\n----------------\n
------
<hs>
{history}
</hs>
------
{question}
Answer:
"""

    placeholder_names = ["history", "context", "question"]
    return PromptTemplate(input_variables=placeholder_names, template=prompt_text)
29
+
30
+
31
+
32
def create_question_answering_chain(retriever):
    """Build the RetrievalQA chain used to answer questions.

    The chain combines a 'gpt-3.5-turbo' chat model (temperature 0) with the
    given retriever, using the 'stuff' chain type. The inner chain is given
    the Juniper prompt template and a conversation buffer memory whose
    memory key is "history" and whose input key is "question".

    Args:
        retriever (obj): The document retriever used to find relevant documents.

    Returns:
        The object returned by ``RetrievalQA.from_chain_type``.
    """
    # Chat model, configured with the API key currently set on the openai module.
    chat_model = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo', openai_api_key=openai.api_key)

    # Options forwarded to the inner chain: quiet mode, custom prompt, chat memory.
    inner_chain_options = {
        "verbose": False,
        "prompt": create_juniper_prompt_template(),
        "memory": ConversationBufferMemory(memory_key="history", input_key="question"),
    }

    return RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type='stuff',
        retriever=retriever,
        verbose=False,
        chain_type_kwargs=inner_chain_options,
    )
db/chroma-collections.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d11f275e47d9d5a2bb0acb41c5746868e7288b0436871abe793fbd1679064d5e
3
+ size 557
db/chroma-embeddings.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9eac62ca72d3b72a738519d1fb159052c35a8b43284974729b737525c88d920c
3
+ size 244539180
db/index/id_to_uuid_d25f8acb-f4d6-4b67-b80a-9b85ac72b87c.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:874a7a333254e6fc9fc426e74068ae585acad068eb18f40ad8781b206f30a778
3
+ size 641398
db/index/index_d25f8acb-f4d6-4b67-b80a-9b85ac72b87c.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eff35d3c91148d593e15d8d18684b1841f5008db2bd664418736fec5b93f2531
3
+ size 124197200
db/index/index_metadata_d25f8acb-f4d6-4b67-b80a-9b85ac72b87c.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1da962bf05d46a551c527af9a099dbbcddd739579e064b36fed531e14faeb5dc
3
+ size 105
db/index/uuid_to_id_d25f8acb-f4d6-4b67-b80a-9b85ac72b87c.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:878e7943e4bbdd6b6b5963aa63441f165dce733a0d6a3304e839ce2231f63246
3
+ size 749904
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ langchain
2
+ openai
3
+ tiktoken
4
+ chromadb
5
+ langchain
6
+ pypdf
7
+ gradio
retriever.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def initialize_document_retriever(top_k_documents, vector_database):
    """Create a retriever from a vector database.

    Args:
        top_k_documents (int): The number of top relevant documents to retrieve;
            passed to the database as the "k" search option.
        vector_database (obj): The vector database; its ``as_retriever`` method
            is called to build the retriever.

    Returns:
        The retriever returned by ``vector_database.as_retriever``.
    """
    search_options = {"k": top_k_documents}
    return vector_database.as_retriever(search_kwargs=search_options)
22
+
23
def retrieve_relevant_documents(search_query, document_retriever):
    """Look up the documents relevant to a search query.

    Args:
        search_query (str): The query to search for.
        document_retriever (obj): An initialized retriever; its
            ``get_relevant_documents`` method performs the lookup.

    Returns:
        Whatever ``document_retriever.get_relevant_documents`` returns for the query.
    """
    fetch_documents = document_retriever.get_relevant_documents
    return fetch_documents(search_query)