# NOTE: removed Hugging Face Spaces page-scrape artifacts (status lines,
# commit hashes, and a line-number gutter) that made this file unparseable.
import os
import pickle
from nltk.tokenize import sent_tokenize
import nltk
from PyPDF2 import PdfReader
import streamlit as st
from streamlit_extras.add_vertical_space import add_vertical_space
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
# Download the Punkt sentence-tokenizer models needed by sent_tokenize below.
# Runs at import time on every app start; nltk skips the download if cached.
nltk.download('punkt')
# Sidebar contents: branding, API-key entry, and marketing blurb.
with st.sidebar:
    st.title(':orange_book: BinDoc GmbH')

    # API key input — exported via the environment so the OpenAI client
    # libraries pick it up implicitly.
    user_key = st.text_input('Enter your OpenAI API Key:', type='password')
    if not user_key:
        st.warning('API key is required to proceed.')
    else:
        os.environ['OPENAI_API_KEY'] = user_key

    # One st.markdown call per line keeps each fragment its own paragraph.
    st.markdown("Experience the future of document interaction with the revolutionary")
    st.markdown("**BinDocs Chat App**.")
    st.markdown("Harnessing the power of a Large Language Model and AI technology,")
    st.markdown("this innovative platform redefines PDF engagement,")
    st.markdown("enabling dynamic conversations that bridge the gap between")
    st.markdown("human and machine intelligence.")

    add_vertical_space(3)  # Add more vertical space between text blocks
    st.write('Made with ❤️ by BinDoc GmbH')
def load_pdf(file_path):
    """Extract text from an uploaded PDF and return a FAISS vector store.

    The store is cached on disk next to the app as ``<pdf name>.pkl`` so
    repeat questions against the same file skip re-embedding.

    Args:
        file_path: An uploaded-file object exposing ``.name`` and readable
            by PyPDF2's ``PdfReader`` (e.g. a Streamlit ``UploadedFile``).

    Returns:
        A FAISS vector store over the per-page text chunks.

    Raises:
        ValueError: If no page yields any extractable text (scanned/image
            PDFs), which would otherwise fail deep inside FAISS.
    """
    pdf_reader = PdfReader(file_path)
    # One chunk per page; skip pages where extraction returns None/"".
    chunks = [text for page in pdf_reader.pages if (text := page.extract_text())]
    if not chunks:
        raise ValueError("No extractable text found in the uploaded PDF.")

    # splitext is robust to names without the expected 4-char ".pdf" suffix
    # (the original sliced file_path.name[:-4] unconditionally).
    store_name = os.path.splitext(file_path.name)[0]
    cache_path = f"{store_name}.pkl"

    if os.path.exists(cache_path):
        # NOTE(security): pickle.load executes arbitrary code from the file.
        # Acceptable only because the cache is written by this app itself;
        # never point this at untrusted input.
        with open(cache_path, "rb") as f:
            VectorStore = pickle.load(f)
    else:
        embeddings = OpenAIEmbeddings()
        VectorStore = FAISS.from_texts(chunks, embedding=embeddings)
        with open(cache_path, "wb") as f:
            pickle.dump(VectorStore, f)
    return VectorStore
def load_chatbot(max_tokens=120):
    """Build a "stuff"-type QA chain over an OpenAI completion model.

    Args:
        max_tokens: Cap on tokens generated per answer (default 120).

    Returns:
        A langchain QA chain ready for ``chain.run(...)``.
    """
    llm = OpenAI(temperature=0.5, max_tokens=max_tokens)
    return load_qa_chain(llm=llm, chain_type="stuff")
def display_chat_history(chat_history):
    """Render every (sender, message, status) entry as a colored chat bubble.

    "new" messages are highlighted orange; otherwise user messages get one
    blue tint and bot messages another.
    """
    for sender, message, status in chat_history:
        if status == "new":
            background_color = "#FFA07A"
        elif sender == "User":
            background_color = "#acf"
        else:
            background_color = "#caf"
        st.markdown(
            f"<div style='background-color: {background_color}; padding: 10px; "
            f"border-radius: 10px; margin: 10px;'>{sender}: {message}</div>",
            unsafe_allow_html=True,
        )
def remove_incomplete_sentences(text):
    """Drop trailing/partial sentences, keeping only ones that end in
    '.', '!' or '?'. Sentence boundaries come from NLTK's tokenizer."""
    complete = []
    for sentence in sent_tokenize(text):
        if sentence.endswith(('.', '!', '?')):
            complete.append(sentence)
    return ' '.join(complete)
def remove_redundant_information(text):
    """Remove duplicate sentences from *text* while preserving order.

    Bug fix: the previous ``list(set(sentences))`` returned the unique
    sentences in arbitrary (hash) order, scrambling the answer text.
    ``dict.fromkeys`` deduplicates while keeping first-seen order.
    """
    sentences = sent_tokenize(text)
    unique_sentences = list(dict.fromkeys(sentences))
    return ' '.join(unique_sentences)
# Define a maximum token limit to avoid infinite loops
# Upper bound for the retry loop in main(), which grows max_tokens by 50
# per attempt while the answer still ends mid-sentence.
MAX_TOKEN_LIMIT = 400
def main():
    """Streamlit entry point: upload a PDF, ask questions, show chat history.

    Fixes over the previous version:
    - An empty query is no longer appended to chat history when "Ask" is
      clicked with no text.
    - The token-budget retry loop re-filters after each continuation, so
      its end-of-sentence check sees cleaned text and raw duplicate
      sentences are not displayed verbatim.
    - Removed the dead ``query = ""`` rebinding (a local assignment cannot
      clear a Streamlit text_input widget).
    """
    st.title("BinDocs Chat App")

    # Persist the conversation across Streamlit reruns.
    if "chat_history" not in st.session_state:
        st.session_state['chat_history'] = []

    display_chat_history(st.session_state['chat_history'])

    # HTML spacer so new messages render near the bottom of the page.
    st.write("<!-- Start Spacer -->", unsafe_allow_html=True)
    st.write("<div style='flex: 1;'></div>", unsafe_allow_html=True)
    st.write("<!-- End Spacer -->", unsafe_allow_html=True)

    new_messages_placeholder = st.empty()

    pdf = st.file_uploader("Upload your PDF", type="pdf")

    if pdf is not None:
        query = st.text_input("Ask questions about your PDF file (in any preferred language):")
        # Render the button unconditionally (it is a widget, not just a
        # condition), then guard against empty queries before answering.
        ask_clicked = st.button("Ask")
        if query and (ask_clicked or query != st.session_state.get('last_input', '')):
            st.session_state['last_input'] = query
            st.session_state['chat_history'].append(("User", query, "new"))

            loading_message = st.empty()
            loading_message.text('Bot is thinking...')

            VectorStore = load_pdf(pdf)
            max_tokens = 100
            chain = load_chatbot(max_tokens=max_tokens)
            docs = VectorStore.similarity_search(query=query, k=2)
            # The callback context tracks OpenAI token usage for this call.
            with get_openai_callback():
                response = chain.run(input_documents=docs, question=query)

            # Post-processing: strip partial sentences, then duplicates.
            filtered_response = remove_incomplete_sentences(response)
            filtered_response = remove_redundant_information(filtered_response)

            # If the answer still ends mid-sentence, retry with a larger
            # token budget, bounded by MAX_TOKEN_LIMIT to avoid looping.
            while (not filtered_response.strip().endswith(('.', '!', '?'))
                   and max_tokens < MAX_TOKEN_LIMIT):
                max_tokens += 50  # Increase the max_tokens limit
                chain = load_chatbot(max_tokens=max_tokens)
                additional_response = chain.run(input_documents=docs, question=query)
                # Re-filter the combined text so the loop condition tests
                # cleaned output and duplicates are removed before display.
                filtered_response = remove_redundant_information(
                    remove_incomplete_sentences(
                        filtered_response + " " + additional_response))

            st.session_state['chat_history'].append(("Bot", filtered_response, "new"))

            # Display the newest exchange at the bottom of the page.
            for sender, message, status in st.session_state['chat_history'][-2:]:
                if status == "new":
                    background_color = "#FFA07A"
                elif sender == "User":
                    background_color = "#acf"
                else:
                    background_color = "#caf"
                new_messages_placeholder.markdown(
                    f"<div style='background-color: {background_color}; padding: 10px; "
                    f"border-radius: 10px; margin: 10px;'>{sender}: {message}</div>",
                    unsafe_allow_html=True,
                )

            # Best-effort scroll to the latest response via injected JS.
            st.write("<script>document.getElementById('response').scrollIntoView();</script>", unsafe_allow_html=True)

            loading_message.empty()

    # Mark all messages as old after displaying
    st.session_state['chat_history'] = [
        (sender, msg, "old") for sender, msg, _ in st.session_state['chat_history']
    ]


if __name__ == "__main__":
    main()