|
import os |
|
from huggingface_hub import Repository |
|
import streamlit.components.v1 as components |
|
from datasets import load_dataset |
|
import random |
|
import pickle |
|
from nltk.tokenize import sent_tokenize |
|
import nltk |
|
from PyPDF2 import PdfReader |
|
import streamlit as st |
|
from streamlit_extras.add_vertical_space import add_vertical_space |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain.embeddings.openai import OpenAIEmbeddings |
|
from langchain.vectorstores import FAISS |
|
from langchain.llms import OpenAI |
|
from langchain.chains.question_answering import load_qa_chain |
|
from langchain.callbacks import get_openai_callback |
|
from my_component import my_component |
|
|
|
# Fetch the Punkt sentence-tokenizer model required by sent_tokenize() below.
nltk.download('punkt')


# Clone (or reuse) the private Hugging Face dataset repo that holds the PDF.
# NOTE(review): reads HUB_TOKEN from the environment and raises KeyError if
# it is missing — confirm the deployment always sets it.
repo = Repository(
    local_dir="Private_Book",
    repo_type="dataset",
    clone_from="Anne31415/Private_Book",
    token=os.environ["HUB_TOKEN"]
)
# Ensure the local checkout is up to date with the remote dataset.
repo.git_pull()


# Path to the glossary PDF inside the freshly pulled repo.
pdf_file_path = "Private_Book/Glossar_PDF_webscraping.pdf"
|
|
|
|
|
|
|
# Sidebar: branding, API-key gate, and a short product blurb.
with st.sidebar:
    st.title(':orange_book: BinDoc GmbH')

    # The OpenAI key must come from the environment; the app refuses to
    # continue without it.
    api_key = os.getenv("OPENAI_API_KEY")

    if not api_key:
        st.warning('API key is required to proceed.')
        st.stop()  # aborts the Streamlit script run here

    # Marketing copy, rendered one markdown line at a time.
    st.markdown("Experience the future of document interaction with the revolutionary")
    st.markdown("**BinDocs Chat App**.")
    st.markdown("Harnessing the power of a Large Language Model and AI technology,")
    st.markdown("this innovative platform redefines PDF engagement,")
    st.markdown("enabling dynamic conversations that bridge the gap between")
    st.markdown("human and machine intelligence.")

    add_vertical_space(3)
    st.write('Made with ❤️ by BinDoc GmbH')
|
|
|
def load_pdf(file_path):
    """Extract per-page text from a PDF and return a FAISS vector store over it.

    The store is cached on disk as "<basename>.pkl" in the working directory;
    subsequent calls unpickle the cache instead of re-embedding (which would
    cost OpenAI API calls).

    Args:
        file_path: Path to the PDF file to index.

    Returns:
        A FAISS vector store built from the non-empty page texts.
    """
    pdf_reader = PdfReader(file_path)
    chunks = []
    for page in pdf_reader.pages:
        text = page.extract_text()
        if text:  # skip pages with no extractable text (e.g. scanned images)
            chunks.append(text)

    # splitext handles extensions of any length; the previous [:-4] slice
    # silently corrupted the cache name for anything but a 3-char extension.
    store_name = os.path.splitext(os.path.basename(file_path))[0]
    pickle_path = f"{store_name}.pkl"

    if os.path.exists(pickle_path):
        # NOTE(review): unpickling is only safe because this cache file is
        # produced locally by this same app; never load untrusted pickles.
        with open(pickle_path, "rb") as f:
            VectorStore = pickle.load(f)
    else:
        embeddings = OpenAIEmbeddings()
        VectorStore = FAISS.from_texts(chunks, embedding=embeddings)
        with open(pickle_path, "wb") as f:
            pickle.dump(VectorStore, f)

    return VectorStore
|
|
|
|
|
def load_chatbot(max_tokens=300):
    """Build a 'stuff'-type QA chain backed by the OpenAI completion LLM.

    Args:
        max_tokens: Cap on the number of tokens the model may generate.

    Returns:
        A langchain question-answering chain ready for `.run(...)`.
    """
    llm = OpenAI(temperature=0.1, max_tokens=max_tokens)
    return load_qa_chain(llm=llm, chain_type="stuff")
|
|
|
|
|
def display_chat_history(chat_history):
    """Render each (sender, message, status) entry as a colored chat bubble.

    Entries flagged "new" are highlighted; otherwise the color depends on
    whether the sender is the user or the bot.
    """
    for sender, message, status in chat_history:
        if status == "new":
            color = "#FFA07A"
        elif sender == "User":
            color = "#acf"
        else:
            color = "#caf"
        st.markdown(
            f"<div style='background-color: {color}; padding: 10px; border-radius: 10px; margin: 10px;'>{sender}: {message}</div>",
            unsafe_allow_html=True,
        )
|
|
|
def remove_incomplete_sentences(text):
    """Keep only sentences that end in '.', '!' or '?'.

    Drops trailing fragments the LLM produced when it hit its token cap,
    then rejoins the survivors with single spaces.
    """
    kept = []
    for sentence in sent_tokenize(text):
        if sentence.endswith(('.', '!', '?')):
            kept.append(sentence)
    return ' '.join(kept)
|
|
|
def remove_redundant_information(text):
    """Remove exact-duplicate sentences while preserving their original order.

    The previous implementation used ``list(set(...))``, which both scrambled
    the sentence order and made the output nondeterministic across runs
    (set iteration order depends on hashing). ``dict.fromkeys`` deduplicates
    while keeping first-occurrence order.
    """
    sentences = sent_tokenize(text)
    unique_sentences = list(dict.fromkeys(sentences))
    return ' '.join(unique_sentences)
|
|
|
|
|
# Upper bound for generated-answer length, in model tokens.
# NOTE(review): main() currently hardcodes max_tokens = 120 instead of using
# this constant — confirm which limit is intended and unify.
MAX_TOKEN_LIMIT = 400

# (removed a redundant mid-file `import random`; the module is already
# imported in the header block at the top of the file)
|
|
|
|
|
def main():
    """Render the chat UI, answer one query per rerun, and persist history.

    Streamlit re-executes this function on every user interaction; chat
    state survives reruns via st.session_state['chat_history'], a list of
    (sender, message, status) tuples where status is "new" or "old".
    """
    st.title("BinDocs Chat App")

    # First run: initialize the persistent chat history.
    if "chat_history" not in st.session_state:
        st.session_state['chat_history'] = []

    # Re-render everything said so far (all entries are "old" at this point,
    # see the remapping at the end of this function).
    display_chat_history(st.session_state['chat_history'])

    # Placeholder that the newest exchange is rendered into below.
    new_messages_placeholder = st.empty()

    query = st.text_input("Ask questions about your PDF file (in any preferred language):")

    # Canned-question buttons; clicking one overrides the free-text input.
    if st.button("Was genau ist ein Belegarzt?"):
        query = "Was genau ist ein Belegarzt?"
    if st.button("Wofür wird die Alpha-ID verwendet?"):
        query = "Wofür wird die Alpha-ID verwendet?"
    if st.button("Was sind die Vorteile des ambulanten operierens?"):
        query = "Was sind die Vorteile des ambulanten operierens?"

    if query:
        st.session_state['last_input'] = query
        st.session_state['chat_history'].append(("User", query, "new"))

        loading_message = st.empty()
        loading_message.text('Bot is thinking...')

        # NOTE(review): load_pdf() is called on every query; it hits its
        # on-disk pickle cache after the first call, but an in-session cache
        # (e.g. st.cache_resource) would avoid re-unpickling each rerun.
        VectorStore = load_pdf(pdf_file_path)
        # NOTE(review): 120 here ignores the module-level MAX_TOKEN_LIMIT
        # (400) — confirm which cap is intended.
        max_tokens = 120
        chain = load_chatbot(max_tokens=max_tokens)
        # Retrieve the 2 most similar chunks as context for the answer.
        docs = VectorStore.similarity_search(query=query, k=2)

        # Callback tracks OpenAI token usage/cost (currently unused: `cb`).
        with get_openai_callback() as cb:
            response = chain.run(input_documents=docs, question=query)

        # Post-process: drop truncated sentences, then deduplicate.
        filtered_response = remove_incomplete_sentences(response)
        filtered_response = remove_redundant_information(filtered_response)

        st.session_state['chat_history'].append(("Bot", filtered_response, "new"))

        # Render just the latest user/bot pair into the placeholder,
        # highlighted via the "new" status color.
        new_messages = st.session_state['chat_history'][-2:]
        for chat in new_messages:
            background_color = "#FFA07A" if chat[2] == "new" else "#acf" if chat[0] == "User" else "#caf"
            new_messages_placeholder.markdown(f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{chat[0]}: {chat[1]}</div>", unsafe_allow_html=True)

        # NOTE(review): attempts to auto-scroll, but no element with
        # id='response' is created anywhere — likely a no-op; verify.
        st.write("<script>document.getElementById('response').scrollIntoView();</script>", unsafe_allow_html=True)

        loading_message.empty()

        # Local reset only; does not clear the on-screen text_input widget.
        query = ""
    else:
        st.warning("Please enter a query before asking questions.")

    # Demote every entry to "old" so the next rerun renders it unhighlighted.
    st.session_state['chat_history'] = [(sender, msg, "old") for sender, msg, _ in st.session_state['chat_history']]
|
|
|
|
|
# Script entry point. Streamlit executes this module top-to-bottom on every
# interaction, so main() effectively runs once per rerun.
if __name__ == "__main__":
    main()
|
|