Skier8402 committed
Commit 5bea413 · verified · 1 Parent(s): ea3db3e

Update app.py

Files changed (1)
  1. app.py +75 -112
app.py CHANGED
@@ -15,131 +15,94 @@ from langchain.vectorstores import FAISS
 from langchain.chat_models import ChatOpenAI
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
+from langchain.schema import BaseOutputParser, OutputParserException
 from htmlTemplates import css, bot_template, user_template
 from langchain.llms import HuggingFaceHub
 
+class ReferenceOutputParser(BaseOutputParser):
+    def parse(self, text: str) -> dict:
+        try:
+            result, references = text.split("References:")
+            return {"result": result.strip(), "references": [ref.strip() for ref in references.split("\n") if ref.strip()]}
+        except ValueError:
+            raise OutputParserException(f"Could not parse output: {text}")
 
 def get_pdf_text(pdf_docs):
-    """
-    Extract text from a list of PDF documents.
-
-    Parameters
-    ----------
-    pdf_docs : list
-        List of PDF documents to extract text from.
-
-    Returns
-    -------
-    str
-        Extracted text from all the PDF documents.
-
-    """
     text = ""
     for pdf in pdf_docs:
-        pdf_reader = PdfReader(pdf)
-        for page in pdf_reader.pages:
-            text += page.extract_text()
+        try:
+            pdf_reader = PdfReader(pdf)
+            for page in pdf_reader.pages:
+                text += page.extract_text()
+        except Exception as e:
+            st.error(f"Error extracting text from PDF: {e}")
     return text
 
-
 def get_text_chunks(text):
-    """
-    Split the input text into chunks.
-
-    Parameters
-    ----------
-    text : str
-        The input text to be split.
-
-    Returns
-    -------
-    list
-        List of text chunks.
-
-    """
     text_splitter = CharacterTextSplitter(
         separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
     )
-    chunks = text_splitter.split_text(text)
+    try:
+        chunks = text_splitter.split_text(text)
+    except Exception as e:
+        st.error(f"Error splitting text into chunks: {e}")
+        chunks = []
    return chunks
 
-
 def get_vectorstore(text_chunks):
-    """
-    Generate a vector store from a list of text chunks using HuggingFace BgeEmbeddings.
-
-    Parameters
-    ----------
-    text_chunks : list
-        List of text chunks to be embedded.
-
-    Returns
-    -------
-    FAISS
-        A FAISS vector store containing the embeddings of the text chunks.
-
-    """
     model = "BAAI/bge-base-en-v1.5"
     encode_kwargs = {
         "normalize_embeddings": True
-    }  # set True to compute cosine similarity
-    embeddings = HuggingFaceBgeEmbeddings(
-        model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
-    )
-    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+    }
+    try:
+        embeddings = HuggingFaceBgeEmbeddings(
+            model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
+        )
+        vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+    except Exception as e:
+        st.error(f"Error creating vector store: {e}")
+        vectorstore = None
     return vectorstore
 
-
 def get_conversation_chain(vectorstore):
-    """
-    Create a conversational retrieval chain using a vector store and a language model.
-
-    Parameters
-    ----------
-    vectorstore : FAISS
-        A FAISS vector store containing the embeddings of the text chunks.
-
-    Returns
-    -------
-    ConversationalRetrievalChain
-        A conversational retrieval chain for generating responses.
-
-    """
-    llm = HuggingFaceHub(
-        repo_id="mistralai/Mistral-7B-v0.3",
-        model_kwargs={"temperature": 0.5, "max_length": 4000},
-    )
-    # llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
+    if vectorstore is None:
+        return None
 
-    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
-    conversation_chain = ConversationalRetrievalChain.from_llm(
-        llm=llm, retriever=vectorstore.as_retriever(), memory=memory
-    )
+    try:
+        llm = HuggingFaceHub(
+            repo_id="mistralai/Mistral-7B-v0.3",
+            model_kwargs={"temperature": 0.5, "max_length": 4000},
+        )
+        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+        conversation_chain = ConversationalRetrievalChain.from_llm(
+            llm=llm, retriever=vectorstore.as_retriever(), memory=memory, output_parser=ReferenceOutputParser()
+        )
+    except Exception as e:
+        st.error(f"Error creating conversation chain: {e}")
+        conversation_chain = None
     return conversation_chain
 
-
 def handle_userinput(user_question):
-    """
-    Handle user input and generate a response using the conversational retrieval chain.
-    Parameters
-    ----------
-    user_question : str
-        The user's question.
-    """
-    response = st.session_state.conversation({"question": user_question})
-    st.session_state.chat_history = response["chat_history"]
-
-    for i, message in enumerate(st.session_state.chat_history):
-        if i % 2 == 0:
-            st.write("//_^ User: " + message.content)
-        else:
-            st.write("🤖 ChatBot: " + message.content)
+    if st.session_state.conversation is None:
+        st.error("Please process the PDF files before asking a question.")
+        return
+
+    try:
+        response = st.session_state.conversation({"question": user_question})
+        st.session_state.chat_history = response["chat_history"]
+
+        result = response["result"]
+        references = response["references"]
+
+        st.write("//_^ User: " + user_question)
+        st.write("🤖 ChatBot: " + result)
+        st.write("References:")
+        for ref in references:
+            st.write("- " + ref)
+    except Exception as e:
+        st.error(f"Error handling user input: {e}")
 
-
 def main():
-    """
-    Putting it all together.
-    """
     st.set_page_config(
         page_title="Chat with a Bot that tries to answer questions about multiple PDFs",
         page_icon=":books:",
@@ -150,15 +113,13 @@ def main():
 
     st.write(css, unsafe_allow_html=True)
 
-    # set huggingface hub token in st.text_input widget
-    # then hide the input
     huggingface_token = st.text_input("Enter your HuggingFace Hub token", type="password")
     #openai_api_key = st.text_input("Enter your OpenAI API key", type="password")
 
-    # set this key as an environment variable
-    os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingface_token
-    #os.environ["OPENAI_API_KEY"] = openai_api_key
-
+    if huggingface_token:
+        os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingface_token
+    #if openai_api_key:
+    #    os.environ["OPENAI_API_KEY"] = openai_api_key
 
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
@@ -177,18 +138,20 @@
         )
         if st.button("Process"):
             with st.spinner("Processing"):
-                # get pdf text
-                raw_text = get_pdf_text(pdf_docs)
+                try:
+                    # get pdf text
+                    raw_text = get_pdf_text(pdf_docs)
 
-                # get the text chunks
-                text_chunks = get_text_chunks(raw_text)
+                    # get the text chunks
+                    text_chunks = get_text_chunks(raw_text)
 
-                # create vector store
-                vectorstore = get_vectorstore(text_chunks)
+                    # create vector store
+                    vectorstore = get_vectorstore(text_chunks)
 
-                # create conversation chain
-                st.session_state.conversation = get_conversation_chain(vectorstore)
+                    # create conversation chain
+                    st.session_state.conversation = get_conversation_chain(vectorstore)
+                except Exception as e:
+                    st.error(f"Error processing PDF files: {e}")
 
-
 if __name__ == "__main__":
-    main()
+    main()
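A quick way to sanity-check the new ReferenceOutputParser: it assumes the model's reply contains the literal marker "References:" exactly once, since zero or several occurrences make the two-way unpack fail, which the class surfaces as an OutputParserException. A minimal standalone sketch (the sample reply below is made up for illustration; the class body mirrors the one added in this commit):

from langchain.schema import BaseOutputParser, OutputParserException

class ReferenceOutputParser(BaseOutputParser):
    # Same logic as the class added in this commit.
    def parse(self, text: str) -> dict:
        try:
            result, references = text.split("References:")
            return {
                "result": result.strip(),
                "references": [ref.strip() for ref in references.split("\n") if ref.strip()],
            }
        except ValueError:
            raise OutputParserException(f"Could not parse output: {text}")

sample = (
    "The chunks most similar to the question come from pages 3 and 7.\n"
    "References:\n"
    "- report.pdf, page 3\n"
    "- report.pdf, page 7"
)
parsed = ReferenceOutputParser().parse(sample)
print(parsed["result"])      # The chunks most similar to the question come from pages 3 and 7.
print(parsed["references"])  # ['- report.pdf, page 3', '- report.pdf, page 7']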
 
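The commit drops the old inline comment on normalize_embeddings ("set True to compute cosine similarity"), but the setting still carries that meaning: when every embedding has unit L2 norm, the inner-product scores the FAISS index computes coincide with cosine similarity. A small numeric illustration:

import numpy as np

# With normalize_embeddings=True each embedding is unit-length, so the
# dot product of two embeddings equals their cosine similarity.
a = np.array([0.6, 0.8])  # unit-length vector
b = np.array([1.0, 0.0])  # unit-length vector
cosine = (a @ b) / (np.linalg.norm(a) * np.linalg.norm(b))
assert np.isclose(a @ b, cosine)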
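One caveat on the new wiring: in the langchain releases this app appears to target, ConversationalRetrievalChain returns its reply under the "answer" output key, and from_llm forwards unrecognized keyword arguments to the chain's constructor, which defines no output_parser field — so the output_parser=ReferenceOutputParser() argument and the response["result"] / response["references"] lookups in handle_userinput may fail at runtime. A hedged sketch of an alternative that applies the parser to the chain's answer by hand (variable names as in the commit):

# Build the chain without the output_parser kwarg ...
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm, retriever=vectorstore.as_retriever(), memory=memory
)
response = conversation_chain({"question": user_question})
# ... then parse the chain's default "answer" key explicitly.
parsed = ReferenceOutputParser().parse(response["answer"])
result, references = parsed["result"], parsed["references"]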