Spaces:
Sleeping
Sleeping
File size: 6,683 Bytes
2df9243 9c5d83d 2b04423 7c95914 2df9243 7c95914 2df9243 7c95914 2df9243 7c95914 47f6195 7c95914 abd1f1b 976634b d3de2d8 976634b d3de2d8 976634b a98948f 976634b a98948f 2df9243 7c95914 f8dc8d1 a98948f 403a475 7c95914 f7d3ba1 7c95914 f7d3ba1 cab37f8 2df9243 7c95914 2df9243 7c95914 f9dbffb 7c95914 e38f084 5751fd4 ebf2d4c c239258 9ca7d21 2df9243 2b04423 7c95914 f3ee795 7c95914 9ca7d21 eb1ac12 48a4a46 7c95914 48a4a46 2df9243 bcd8992 586a969 bcd8992 0f37a25 ffab811 2df9243 bcd8992 2b04423 2df9243 15fb41d abd1f1b 48a4a46 abd1f1b 48a4a46 abd1f1b 2b04423 48a4a46 14a29f5 6a479ca 8e976fb 6a479ca 8e976fb 6a479ca 8e976fb 6a479ca 8e976fb 14a29f5 abd1f1b 712c0cb 6f8c132 877c64e 7c95914 71665a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
import os
from datasets import load_dataset
import random
import pickle
from nltk.tokenize import sent_tokenize
import nltk
from PyPDF2 import PdfReader
import streamlit as st
from streamlit_extras.add_vertical_space import add_vertical_space
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
# Download the NLTK 'punkt' resource; sent_tokenize is used by the
# response post-processing helpers in this file.
nltk.download('punkt')

# Sidebar: branding, API-key gate and a short product description.
with st.sidebar:
    st.title(':orange_book: BinDoc GmbH')

    # The OpenAI key is read from the process environment. Without it the
    # script run is halted here, so nothing below this point is rendered.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        st.warning('API key is required to proceed.')
        st.stop()

    # Each entry is emitted as its own markdown element, in this order.
    for blurb_line in (
        "Experience the future of document interaction with the revolutionary",
        "**BinDocs Chat App**.",
        "Harnessing the power of a Large Language Model and AI technology,",
        "this innovative platform redefines PDF engagement,",
        "enabling dynamic conversations that bridge the gap between",
        "human and machine intelligence.",
    ):
        st.markdown(blurb_line)

    # Spacer between the description and the footer line.
    add_vertical_space(3)
    st.write('Made with ❤️ by BinDoc GmbH')
def load_pdf(file_path):
    """Return a FAISS vector store for an uploaded PDF, cached on disk.

    The store is cached as ``<name-without-extension>.pkl``, where the name
    comes from ``file_path.name``. When that file already exists it is
    unpickled and returned without reading the PDF again; otherwise the text
    of every page is extracted, embedded with ``OpenAIEmbeddings`` and the
    resulting store is pickled for the next call.

    Args:
        file_path: A file-like object accepted by ``PdfReader`` that also
            exposes a ``name`` attribute (``main`` passes the object returned
            by ``st.file_uploader``).

    Returns:
        The cached or newly built vector store.

    Raises:
        ValueError: If no page of the PDF yields any extractable text.
    """
    # splitext instead of slicing off four characters, so names without a
    # ".pdf"-length extension still map to a sensible cache file.
    store_name, _ = os.path.splitext(file_path.name)
    cache_path = f"{store_name}.pkl"

    # NOTE(review): the cache is keyed by file name only, so two different
    # PDFs uploaded under the same name share one store -- confirm that this
    # is acceptable for the deployment.
    if os.path.exists(cache_path):
        # NOTE(security): pickle.load can execute arbitrary code. This is only
        # safe while the .pkl files are written exclusively by this function.
        with open(cache_path, "rb") as f:
            return pickle.load(f)

    # Cache miss: only now pay for parsing the PDF.
    chunks = []
    for page in PdfReader(file_path).pages:
        text = page.extract_text()
        if text:
            chunks.append(text)

    if not chunks:
        # Fail with a clear message before any embedding request is made and
        # before an unusable cache file is written.
        raise ValueError(f"No extractable text found in {file_path.name!r}")

    vector_store = FAISS.from_texts(chunks, embedding=OpenAIEmbeddings())
    with open(cache_path, "wb") as f:
        pickle.dump(vector_store, f)
    return vector_store
def load_chatbot(max_tokens=300):
    """Build the question-answering chain used to answer PDF questions.

    Args:
        max_tokens: Value passed to the ``OpenAI`` LLM as ``max_tokens``.

    Returns:
        The chain created by ``load_qa_chain`` with ``chain_type="stuff"``
        around an ``OpenAI`` LLM configured with temperature 0.1.
    """
    llm = OpenAI(temperature=0.1, max_tokens=max_tokens)
    qa_chain = load_qa_chain(llm=llm, chain_type="stuff")
    return qa_chain
def display_chat_history(chat_history):
    """Render each chat entry as a coloured, rounded HTML box.

    Args:
        chat_history: Iterable of entries indexed as ``(sender, message,
            status)``. Entries whose status is ``"new"`` use the highlight
            colour; otherwise the colour depends on whether the sender is
            ``"User"``.
    """
    # Local import so this function brings its own dependency with it.
    import html

    for chat in chat_history:
        sender, message, status = chat[0], chat[1], chat[2]

        if status == "new":
            background_color = "#FFA07A"
        elif sender == "User":
            background_color = "#acf"
        else:
            background_color = "#caf"

        # The markup is rendered with unsafe_allow_html=True, so the sender
        # and message are escaped first: typed questions and model output must
        # not be able to inject tags or break the surrounding <div>.
        safe_sender = html.escape(str(sender))
        safe_message = html.escape(str(message))
        bubble = (
            f"<div style='background-color: {background_color}; padding: 10px; "
            f"border-radius: 10px; margin: 10px;'>{safe_sender}: {safe_message}</div>"
        )
        st.markdown(bubble, unsafe_allow_html=True)
def remove_incomplete_sentences(text):
    """Drop sentences that do not end in terminal punctuation.

    The text is split with ``sent_tokenize``; only sentences ending in
    ``.``, ``!`` or ``?`` are kept, and the survivors are joined with single
    spaces.

    Args:
        text: The text to filter.

    Returns:
        The kept sentences joined by ``' '`` (empty string if none remain).
    """
    terminators = ('.', '!', '?')
    kept = []
    for sentence in sent_tokenize(text):
        if sentence.endswith(terminators):
            kept.append(sentence)
    return ' '.join(kept)
def remove_redundant_information(text):
    """Remove repeated sentences from *text*, keeping the original order.

    The text is split with ``sent_tokenize`` and every sentence is kept only
    the first time it appears.

    Args:
        text: The text to de-duplicate.

    Returns:
        The unique sentences joined by ``' '``, in first-occurrence order.
    """
    sentences = sent_tokenize(text)
    # dict.fromkeys de-duplicates while preserving insertion order. A set
    # would return the sentences in arbitrary order and scramble the answer.
    unique_sentences = dict.fromkeys(sentences)
    return ' '.join(unique_sentences)
# Maximum token limit, described by the original author as a guard against
# infinite loops.
# NOTE(review): nothing in the visible part of this file reads this constant
# (main() passes its own max_tokens value to load_chatbot) -- confirm whether
# it is still needed.
MAX_TOKEN_LIMIT = 400
import random
def main():
    """Render the chat page and answer one question about the uploaded PDF.

    On each script run this draws the stored chat history, the PDF uploader
    and the question input. A question is processed when the "Ask" button is
    pressed or when the input differs from the last processed input. The
    answer comes from the QA chain over the two most similar chunks of the
    PDF, is post-processed, and is appended to ``st.session_state``'s
    ``chat_history``. While the history is empty, three example question
    boxes are shown.
    """
    st.title("BinDocs Chat App")

    # CSS for the rounded example-question boxes rendered at the bottom.
    st.markdown("""
<style>
.question-box {
border: 1px solid orange;
border-radius: 15px;
padding: 10px;
text-align: center;
cursor: pointer;
display: inline-block;
width: 45%;
margin: 2%;
}
</style>
""", unsafe_allow_html=True)

    if "chat_history" not in st.session_state:
        st.session_state['chat_history'] = []

    display_chat_history(st.session_state['chat_history'])

    # Reserved slot above the inputs for the messages produced by this run.
    new_messages_placeholder = st.empty()

    pdf = st.file_uploader("Upload your PDF", type="pdf")
    query = st.text_input("Ask questions about your PDF file (in any preferred language):")

    if st.button("Ask") or (query and query != st.session_state.get('last_input', '')):
        if pdf is None:
            st.warning("Please upload a PDF file before asking questions.")
        elif not query.strip():
            # Pressing "Ask" with an empty box would otherwise record an empty
            # user message and send an empty question to the model.
            st.warning("Please enter a question before pressing Ask.")
        else:
            st.session_state['last_input'] = query
            st.session_state['chat_history'].append(("User", query, "new"))
            loading_message = st.empty()
            loading_message.text('Bot is thinking...')

            try:
                VectorStore = load_pdf(pdf)
                chain = load_chatbot(max_tokens=120)
                docs = VectorStore.similarity_search(query=query, k=2)

                with get_openai_callback():
                    response = chain.run(input_documents=docs, question=query)

                # Post-processing to remove incomplete sentences and redundant information
                filtered_response = remove_incomplete_sentences(response)
                filtered_response = remove_redundant_information(filtered_response)

                st.session_state['chat_history'].append(("Bot", filtered_response, "new"))

                # An st.empty() slot holds a single element, so writing the
                # two messages to it one after another would leave only the
                # last one visible. Render both inside one container instead.
                with new_messages_placeholder.container():
                    display_chat_history(st.session_state['chat_history'][-2:])

                # NOTE(review): no element with id 'response' is created in
                # this function -- confirm this scroll hook has a target.
                st.write("<script>document.getElementById('response').scrollIntoView();</script>", unsafe_allow_html=True)
            finally:
                # Clear the status line even when loading or answering fails.
                loading_message.empty()

        # Everything shown so far counts as old on the next run.
        st.session_state['chat_history'] = [(sender, msg, "old") for sender, msg, _ in st.session_state['chat_history']]

    # Displaying example questions
    if not st.session_state['chat_history']:
        st.markdown("""
<div class="question-box" id="question1">Was genau ist ein Belegarzt?</div>
<div class="question-box" id="question2">Wofür wird die Alpha-ID verwendet?</div>
<br>
<div class="question-box" id="question3">Was sind die Vorteile des ambulanten operierens?</div>
""", unsafe_allow_html=True)

        # NOTE(review): these handlers only log to the browser console, and it
        # should be confirmed that Streamlit executes <script> tags passed
        # through st.markdown at all.
        st.markdown("""
<script>
document.getElementById('question1').onclick = function() {
console.log('Question 1 box clicked');
};
document.getElementById('question2').onclick = function() {
console.log('Question 2 box clicked');
};
document.getElementById('question3').onclick = function() {
console.log('Question 3 box clicked');
};
</script>
""", unsafe_allow_html=True)
# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|