Anne31415 committed on
Commit
2df9243
·
1 Parent(s): 15fb41d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -32
app.py CHANGED
@@ -1,7 +1,9 @@
1
- import streamlit as st
2
- import re
3
  import pickle
 
 
4
  from PyPDF2 import PdfReader
 
5
  from streamlit_extras.add_vertical_space import add_vertical_space
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain.embeddings.openai import OpenAIEmbeddings
@@ -9,13 +11,14 @@ from langchain.vectorstores import FAISS
9
  from langchain.llms import OpenAI
10
  from langchain.chains.question_answering import load_qa_chain
11
  from langchain.callbacks import get_openai_callback
12
- import os
 
13
 
14
  # Sidebar contents
15
  with st.sidebar:
16
  st.title(':orange_book: BinDoc GmbH')
17
 
18
- # API key input (this will not display the entered text)
19
  api_key = st.text_input('Enter your OpenAI API Key:', type='password')
20
 
21
  if api_key:
@@ -23,22 +26,15 @@ with st.sidebar:
23
  else:
24
  st.warning('API key is required to proceed.')
25
 
26
- st.markdown(
27
- "Experience the future of document interaction with the revolutionary"
28
- )
29
-
30
  st.markdown("**BinDocs Chat App**.")
31
-
32
  st.markdown("Harnessing the power of a Large Language Model and AI technology,")
33
-
34
  st.markdown("this innovative platform redefines PDF engagement,")
35
-
36
  st.markdown("enabling dynamic conversations that bridge the gap between")
37
  st.markdown("human and machine intelligence.")
38
 
39
  add_vertical_space(3) # Add more vertical space between text blocks
40
  st.write('Made with ❤️ by BinDoc GmbH')
41
-
42
 
43
  def load_pdf(file_path):
44
  pdf_reader = PdfReader(file_path)
@@ -49,28 +45,39 @@ def load_pdf(file_path):
49
  chunks.append(text)
50
 
51
  store_name = file_path.name[:-4]
52
-
53
  if os.path.exists(f"{store_name}.pkl"):
54
  with open(f"{store_name}.pkl", "rb") as f:
55
  VectorStore = pickle.load(f)
56
  else:
57
- embeddings = OpenAIEmbeddings() # No api_key parameter here
58
  VectorStore = FAISS.from_texts(chunks, embedding=embeddings)
59
  with open(f"{store_name}.pkl", "wb") as f:
60
  pickle.dump(VectorStore, f)
61
 
62
  return VectorStore
63
 
64
-
65
  def load_chatbot(max_tokens=120):
66
  return load_qa_chain(llm=OpenAI(temperature=0.5, max_tokens=max_tokens), chain_type="stuff")
67
 
68
-
69
  def display_chat_history(chat_history):
70
  for chat in chat_history:
71
  background_color = "#FFA07A" if chat[2] == "new" else "#acf" if chat[0] == "User" else "#caf"
72
  st.markdown(f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{chat[0]}: {chat[1]}</div>", unsafe_allow_html=True)
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  def main():
75
  st.title("BinDocs Chat App")
76
 
@@ -91,36 +98,33 @@ def main():
91
  query = st.text_input("Ask questions about your PDF file (in any preferred language):")
92
 
93
  if st.button("Ask") or (query and query != st.session_state.get('last_input', '')):
94
- st.session_state['last_input'] = query # Save the current query as the last input
95
  st.session_state['chat_history'].append(("User", query, "new"))
96
 
97
  loading_message = st.empty()
98
  loading_message.text('Bot is thinking...')
99
 
100
  VectorStore = load_pdf(pdf)
101
- max_tokens = 100 # Initial max tokens
102
  chain = load_chatbot(max_tokens=max_tokens)
103
- docs = VectorStore.similarity_search(query=query, k=1)
 
104
  with get_openai_callback() as cb:
105
  response = chain.run(input_documents=docs, question=query)
106
-
107
 
108
- # Filtering similar responses (a simple example using set to remove duplicate sentences)
109
- response_sentences = response.split('. ')
110
- unique_sentences = set(response_sentences)
111
- filtered_response = '. '.join(unique_sentences)
112
-
113
- # Check if the filtered response ends with a sentence-ending punctuation
114
  while not filtered_response.strip().endswith(('.', '!', '?')) and max_tokens < MAX_TOKEN_LIMIT:
115
  max_tokens += 50 # Increase the max_tokens limit
116
  chain = load_chatbot(max_tokens=max_tokens)
117
  additional_response = chain.run(input_documents=docs, question=query)
118
- filtered_response += additional_response # Append the additional response to the filtered response
119
 
120
  st.session_state['chat_history'].append(("Bot", filtered_response, "new"))
121
 
122
-
123
-
124
  # Display new messages at the bottom
125
  new_messages = st.session_state['chat_history'][-2:]
126
  for chat in new_messages:
@@ -138,8 +142,6 @@ def main():
138
  # Mark all messages as old after displaying
139
  st.session_state['chat_history'] = [(sender, msg, "old") for sender, msg, _ in st.session_state['chat_history']]
140
 
141
- # Define a maximum token limit to avoid infinite loops
142
- MAX_TOKEN_LIMIT = 400
143
-
144
  if __name__ == "__main__":
145
  main()
 
 
1
+ import os
 
2
  import pickle
3
+ from nltk.tokenize import sent_tokenize
4
+ import nltk
5
  from PyPDF2 import PdfReader
6
+ import streamlit as st
7
  from streamlit_extras.add_vertical_space import add_vertical_space
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from langchain.embeddings.openai import OpenAIEmbeddings
 
11
  from langchain.llms import OpenAI
12
  from langchain.chains.question_answering import load_qa_chain
13
  from langchain.callbacks import get_openai_callback
14
+
15
+ nltk.download('punkt')
16
 
17
  # Sidebar contents
18
  with st.sidebar:
19
  st.title(':orange_book: BinDoc GmbH')
20
 
21
+ # API key input
22
  api_key = st.text_input('Enter your OpenAI API Key:', type='password')
23
 
24
  if api_key:
 
26
  else:
27
  st.warning('API key is required to proceed.')
28
 
29
+ st.markdown("Experience the future of document interaction with the revolutionary")
 
 
 
30
  st.markdown("**BinDocs Chat App**.")
 
31
  st.markdown("Harnessing the power of a Large Language Model and AI technology,")
 
32
  st.markdown("this innovative platform redefines PDF engagement,")
 
33
  st.markdown("enabling dynamic conversations that bridge the gap between")
34
  st.markdown("human and machine intelligence.")
35
 
36
  add_vertical_space(3) # Add more vertical space between text blocks
37
  st.write('Made with ❤️ by BinDoc GmbH')
 
38
 
39
  def load_pdf(file_path):
40
  pdf_reader = PdfReader(file_path)
 
45
  chunks.append(text)
46
 
47
  store_name = file_path.name[:-4]
48
+
49
  if os.path.exists(f"{store_name}.pkl"):
50
  with open(f"{store_name}.pkl", "rb") as f:
51
  VectorStore = pickle.load(f)
52
  else:
53
+ embeddings = OpenAIEmbeddings()
54
  VectorStore = FAISS.from_texts(chunks, embedding=embeddings)
55
  with open(f"{store_name}.pkl", "wb") as f:
56
  pickle.dump(VectorStore, f)
57
 
58
  return VectorStore
59
 
 
60
  def load_chatbot(max_tokens=120):
61
  return load_qa_chain(llm=OpenAI(temperature=0.5, max_tokens=max_tokens), chain_type="stuff")
62
 
 
63
  def display_chat_history(chat_history):
64
  for chat in chat_history:
65
  background_color = "#FFA07A" if chat[2] == "new" else "#acf" if chat[0] == "User" else "#caf"
66
  st.markdown(f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{chat[0]}: {chat[1]}</div>", unsafe_allow_html=True)
67
 
68
def remove_incomplete_sentences(text):
    """Drop sentences that do not end with terminal punctuation.

    ``text`` is split with ``sent_tokenize``; only the sentences ending in
    '.', '!' or '?' are kept, and they are joined back with single spaces.

    Args:
        text: The text to filter.

    Returns:
        A string containing only the sentences that passed the check.
    """
    terminators = ('.', '!', '?')
    kept = []
    for sentence in sent_tokenize(text):
        if sentence.endswith(terminators):
            kept.append(sentence)
    return ' '.join(kept)
72
+
73
def remove_redundant_information(text):
    """Remove exact duplicate sentences while keeping the original order.

    ``text`` is split with ``sent_tokenize``; the first occurrence of each
    distinct sentence is kept and later repeats are dropped. The surviving
    sentences are joined with single spaces.

    Args:
        text: The text to de-duplicate.

    Returns:
        A string of the unique sentences in their first-seen order.
    """
    sentences = sent_tokenize(text)
    # dict.fromkeys de-duplicates and preserves insertion order; going
    # through a set would return the sentences in arbitrary order and
    # scramble the answer shown to the user.
    unique_sentences = list(dict.fromkeys(sentences))
    return ' '.join(unique_sentences)
77
+
78
+ # Define a maximum token limit to avoid infinite loops
79
+ MAX_TOKEN_LIMIT = 400
80
+
81
  def main():
82
  st.title("BinDocs Chat App")
83
 
 
98
  query = st.text_input("Ask questions about your PDF file (in any preferred language):")
99
 
100
  if st.button("Ask") or (query and query != st.session_state.get('last_input', '')):
101
+ st.session_state['last_input'] = query
102
  st.session_state['chat_history'].append(("User", query, "new"))
103
 
104
  loading_message = st.empty()
105
  loading_message.text('Bot is thinking...')
106
 
107
  VectorStore = load_pdf(pdf)
108
+ max_tokens = 100
109
  chain = load_chatbot(max_tokens=max_tokens)
110
+ docs = VectorStore.similarity_search(query=query, k=2)
111
+
112
  with get_openai_callback() as cb:
113
  response = chain.run(input_documents=docs, question=query)
 
114
 
115
+ # Post-processing to remove incomplete sentences and redundant information
116
+ filtered_response = remove_incomplete_sentences(response)
117
+ filtered_response = remove_redundant_information(filtered_response)
118
+
119
+ # Check if the response ends with a sentence-ending punctuation
 
120
  while not filtered_response.strip().endswith(('.', '!', '?')) and max_tokens < MAX_TOKEN_LIMIT:
121
  max_tokens += 50 # Increase the max_tokens limit
122
  chain = load_chatbot(max_tokens=max_tokens)
123
  additional_response = chain.run(input_documents=docs, question=query)
124
+ filtered_response += additional_response # Append the additional response to the filtered_response
125
 
126
  st.session_state['chat_history'].append(("Bot", filtered_response, "new"))
127
 
 
 
128
  # Display new messages at the bottom
129
  new_messages = st.session_state['chat_history'][-2:]
130
  for chat in new_messages:
 
142
  # Mark all messages as old after displaying
143
  st.session_state['chat_history'] = [(sender, msg, "old") for sender, msg, _ in st.session_state['chat_history']]
144
 
 
 
 
145
  if __name__ == "__main__":
146
  main()
147
+