LIDA2_csv

Sleeping

App Files Files Community

LIDA2_csv / app.py

Anne31415

Update app.py

d3de2d8 about 2 years ago

raw

history blame

6.07 kB

	import os
	import pickle
	from nltk.tokenize import sent_tokenize
	import nltk
	from PyPDF2 import PdfReader
	import streamlit as st
	from streamlit_extras.add_vertical_space import add_vertical_space
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.embeddings.openai import OpenAIEmbeddings
	from langchain.vectorstores import FAISS
	from langchain.llms import OpenAI
	from langchain.chains.question_answering import load_qa_chain
	from langchain.callbacks import get_openai_callback

	# Streamlit re-executes this script on every user interaction, so only
	# download the Punkt sentence-tokenizer data when it is not already installed.
	try:
	    nltk.data.find('tokenizers/punkt')
	except LookupError:
	    nltk.download('punkt')

	# Sidebar contents
	# NOTE(review): the original indentation of this section was lost; everything
	# up to the "Made with" footer is assumed to belong to the sidebar - confirm.
	with st.sidebar:
	    st.title(':orange_book: BinDoc GmbH')

	    # The OpenAI API key is read from the OPENAI_API_KEY environment variable
	    # (not from st.secrets).
	    api_key = os.getenv("OPENAI_API_KEY")

	    if not api_key:
	        st.warning('API key is required to proceed.')
	        st.stop()  # Stop the app if the API key is not provided

	    st.markdown("Experience the future of document interaction with the revolutionary")
	    st.markdown("BinDocs Chat App.")
	    st.markdown("Harnessing the power of a Large Language Model and AI technology,")
	    st.markdown("this innovative platform redefines PDF engagement,")
	    st.markdown("enabling dynamic conversations that bridge the gap between")
	    st.markdown("human and machine intelligence.")

	    add_vertical_space(3)  # Add more vertical space between text blocks
	    st.write('Made with ❤️ by BinDoc GmbH')

	def load_pdf(file_path):
	    """Return a FAISS vector store for a PDF, using an on-disk pickle cache.

	    The store is cached next to the working directory as
	    ``<file name without extension>.pkl``. When that file exists it is loaded
	    and returned without re-reading the PDF; otherwise the text of every page
	    is extracted, embedded with ``OpenAIEmbeddings`` and the resulting store
	    is pickled for later runs.

	    Args:
	        file_path: Either an object exposing a ``.name`` attribute that
	            ``PdfReader`` can read (``main`` passes the value returned by
	            ``st.file_uploader``) or a plain path to a PDF file.

	    Returns:
	        The vector store built by ``FAISS.from_texts`` (or the unpickled
	        cached object).

	    Raises:
	        ValueError: If no text could be extracted from any page.
	    """
	    source_name = getattr(file_path, "name", None) or os.fspath(file_path)
	    # splitext drops the real extension instead of blindly cutting 4 characters.
	    store_path = f"{os.path.splitext(source_name)[0]}.pkl"

	    # NOTE(review): the cache key is the file name only, so a different PDF
	    # uploaded under the same name gets the previously cached store.
	    if os.path.exists(store_path):
	        # SECURITY: unpickling runs arbitrary code contained in the file; the
	        # cache location must not be writable by untrusted parties.
	        with open(store_path, "rb") as cache_file:
	            return pickle.load(cache_file)

	    # One chunk per page; pages without extractable text are skipped.
	    page_texts = (page.extract_text() for page in PdfReader(file_path).pages)
	    chunks = [text for text in page_texts if text]
	    if not chunks:
	        raise ValueError(f"No extractable text found in {source_name!r}.")

	    vector_store = FAISS.from_texts(chunks, embedding=OpenAIEmbeddings())
	    with open(store_path, "wb") as cache_file:
	        pickle.dump(vector_store, cache_file)

	    return vector_store

	def load_chatbot(max_tokens=300):
	    """Create the question-answering chain used to answer PDF queries.

	    Args:
	        max_tokens: Value passed as ``max_tokens`` to the ``OpenAI`` LLM.

	    Returns:
	        The chain produced by ``load_qa_chain`` with ``chain_type="stuff"``.
	    """
	    llm = OpenAI(temperature=0.1, max_tokens=max_tokens)
	    qa_chain = load_qa_chain(llm=llm, chain_type="stuff")
	    return qa_chain


	def display_chat_history(chat_history):
	    """Render every chat entry as a colored HTML bubble via ``st.markdown``.

	    Args:
	        chat_history: Iterable of ``(sender, message, status)`` tuples. Entries
	            whose status is ``"new"`` are highlighted; the remaining ones are
	            colored by whether the sender is ``"User"``.
	    """
	    import html  # local import keeps this block self-contained

	    for sender, message, status in chat_history:
	        if status == "new":
	            background_color = "#FFA07A"
	        elif sender == "User":
	            background_color = "#acf"
	        else:
	            background_color = "#caf"

	        # The bubble is emitted with unsafe_allow_html=True, so user input and
	        # model output must be escaped to avoid injecting markup into the page.
	        safe_sender = html.escape(str(sender))
	        safe_message = html.escape(str(message))
	        st.markdown(f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{safe_sender}: {safe_message}</div>", unsafe_allow_html=True)

	def remove_incomplete_sentences(text):
	    """Drop sentences that do not end with '.', '!' or '?'.

	    The text is split with ``sent_tokenize``; the sentences that end with one
	    of those characters are joined back together with single spaces.
	    """
	    terminators = ('.', '!', '?')
	    kept = []
	    for sentence in sent_tokenize(text):
	        if sentence.endswith(terminators):
	            kept.append(sentence)
	    return ' '.join(kept)

	def remove_redundant_information(text):
	    """Remove repeated sentences from ``text`` while keeping their order.

	    The text is split with ``sent_tokenize``; only the first occurrence of
	    each distinct sentence is kept and the result is joined with single
	    spaces.
	    """
	    # dict.fromkeys de-duplicates while preserving first-seen order; a plain
	    # set would shuffle the sentences and scramble the answer.
	    unique_sentences = dict.fromkeys(sent_tokenize(text))
	    return ' '.join(unique_sentences)

	# Upper bound for the completion-token budget: the answer-retry logic in this
	# file stops raising max_tokens once the budget has reached this value, which
	# guarantees that the retry loop terminates.
	MAX_TOKEN_LIMIT = 400

	def _answer_question(pdf, query, initial_max_tokens=120, token_step=100):
	    """Answer ``query`` from ``pdf`` and return the cleaned-up response text.

	    The two most similar chunks of the PDF's vector store are passed to the
	    QA chain. The raw response is stripped of incomplete and repeated
	    sentences. If the cleaned answer does not end with sentence punctuation,
	    the question is asked again with a larger token budget until the budget
	    reaches ``MAX_TOKEN_LIMIT``; each retry replaces the previous attempt,
	    because the chain regenerates the whole answer rather than continuing it.
	    """
	    vector_store = load_pdf(pdf)
	    docs = vector_store.similarity_search(query=query, k=2)

	    max_tokens = initial_max_tokens
	    while True:
	        chain = load_chatbot(max_tokens=max_tokens)
	        with get_openai_callback():
	            response = chain.run(input_documents=docs, question=query)

	        # Post-processing to remove incomplete sentences and redundant information
	        answer = remove_redundant_information(remove_incomplete_sentences(response))

	        if answer.strip().endswith(('.', '!', '?')) or max_tokens >= MAX_TOKEN_LIMIT:
	            break
	        max_tokens += token_step  # Retry with a larger completion budget

	    # If filtering removed every sentence, show the raw text rather than an
	    # empty bot message.
	    return answer or response.strip()


	def main():
	    """Run the Streamlit page: show the chat history, accept a PDF and a question.

	    Chat entries live in ``st.session_state['chat_history']`` as
	    ``(sender, message, status)`` tuples. A question is processed when it is
	    non-empty and either the "Ask" button was pressed or the text differs
	    from the previously processed input. The new question/answer pair is
	    rendered highlighted in a placeholder above the uploader and then marked
	    as "old" for subsequent reruns.
	    """
	    st.title("BinDocs Chat App")

	    if "chat_history" not in st.session_state:
	        st.session_state['chat_history'] = []

	    display_chat_history(st.session_state['chat_history'])

	    st.write("<!-- Start Spacer -->", unsafe_allow_html=True)
	    st.write("<div style='flex: 1;'></div>", unsafe_allow_html=True)
	    st.write("<!-- End Spacer -->", unsafe_allow_html=True)

	    new_messages_placeholder = st.empty()

	    pdf = st.file_uploader("Upload your PDF", type="pdf")
	    if pdf is None:
	        return

	    query = st.text_input("Ask questions about your PDF file (in any preferred language):")
	    ask_clicked = st.button("Ask")
	    is_new_input = query != st.session_state.get('last_input', '')

	    # Never send an empty question, even when the button is pressed.
	    if not query.strip() or not (ask_clicked or is_new_input):
	        return

	    st.session_state['last_input'] = query
	    st.session_state['chat_history'].append(("User", query, "new"))

	    loading_message = st.empty()
	    loading_message.text('Bot is thinking...')
	    try:
	        answer = _answer_question(pdf, query)
	    finally:
	        # Remove the indicator even when answering fails.
	        loading_message.empty()

	    st.session_state['chat_history'].append(("Bot", answer, "new"))

	    # Display new messages at the bottom. An st.empty() placeholder holds a
	    # single element, so both messages are rendered inside one container
	    # instead of overwriting each other.
	    with new_messages_placeholder.container():
	        display_chat_history(st.session_state['chat_history'][-2:])

	    # Scroll to the latest response using JavaScript
	    # NOTE(review): nothing in this function renders an element with
	    # id 'response' - verify that this script has any effect.
	    st.write("<script>document.getElementById('response').scrollIntoView();</script>", unsafe_allow_html=True)

	    # Mark all messages as old after displaying
	    st.session_state['chat_history'] = [(sender, msg, "old") for sender, msg, _ in st.session_state['chat_history']]


	if __name__ == "__main__":
	    main()