Spaces:
Sleeping
Sleeping
File size: 7,845 Bytes
2df9243 9c5d83d 2b04423 7c95914 2df9243 7c95914 2df9243 7c95914 2df9243 7c95914 47f6195 7c95914 abd1f1b 976634b d3de2d8 976634b d3de2d8 976634b a98948f 976634b a98948f 2df9243 7c95914 f8dc8d1 a98948f 403a475 7c95914 f7d3ba1 7c95914 f7d3ba1 cab37f8 2df9243 7c95914 2df9243 7c95914 f9dbffb 7c95914 e38f084 5751fd4 ebf2d4c c239258 9ca7d21 2df9243 2b04423 7c95914 f3ee795 7c95914 9ca7d21 eb1ac12 bcd8992 7c95914 bcd8992 2df9243 bcd8992 586a969 bcd8992 0f37a25 ffab811 2df9243 bcd8992 2b04423 2df9243 2b04423 15fb41d abd1f1b 2b04423 abd1f1b 7c95914 9ca7d21 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 |
import os
from datasets import load_dataset
import random
import pickle
from nltk.tokenize import sent_tokenize
import nltk
from PyPDF2 import PdfReader
import streamlit as st
from streamlit_extras.add_vertical_space import add_vertical_space
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
nltk.download('punkt')
# Sidebar contents: branding, API-key gate, and a short product blurb.
with st.sidebar:
    st.title(':orange_book: BinDoc GmbH')

    # Read the OpenAI key from the environment (original comment wrongly
    # said st.secrets); without it the app cannot call the API, so stop.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        st.warning('API key is required to proceed.')
        st.stop()  # Stop the app if the API key is not provided

    st.markdown("Experience the future of document interaction with the revolutionary")
    st.markdown("**BinDocs Chat App**.")
    st.markdown("Harnessing the power of a Large Language Model and AI technology,")
    st.markdown("this innovative platform redefines PDF engagement,")
    st.markdown("enabling dynamic conversations that bridge the gap between")
    st.markdown("human and machine intelligence.")

    add_vertical_space(3)  # Add more vertical space between text blocks
    st.write('Made with ❤️ by BinDoc GmbH')
def load_pdf(file_path):
    """Extract per-page text from an uploaded PDF and return a FAISS store.

    The vector store is cached on disk as '<pdf name>.pkl' so re-uploading
    the same file skips the (paid) embedding step.

    Args:
        file_path: uploaded-file object — must be readable by
            PyPDF2.PdfReader and expose a ``.name`` attribute.

    Returns:
        A FAISS vector store built over one chunk per PDF page.

    Raises:
        ValueError: if no text could be extracted from any page
            (e.g. a scanned/image-only PDF).
    """
    pdf_reader = PdfReader(file_path)
    chunks = []
    for page in pdf_reader.pages:
        text = page.extract_text()
        if text:  # skip pages with no extractable text
            chunks.append(text)

    # Fail early with a clear message instead of letting FAISS.from_texts
    # blow up on an empty list.
    if not chunks:
        raise ValueError("No extractable text found in the uploaded PDF.")

    store_name = file_path.name[:-4]  # strip the '.pdf' extension
    pickle_path = f"{store_name}.pkl"
    if os.path.exists(pickle_path):
        # NOTE(security): unpickling is only safe because this file was
        # written by this app; never load pickles from untrusted sources.
        with open(pickle_path, "rb") as f:
            VectorStore = pickle.load(f)
    else:
        embeddings = OpenAIEmbeddings()
        VectorStore = FAISS.from_texts(chunks, embedding=embeddings)
        with open(pickle_path, "wb") as f:
            pickle.dump(VectorStore, f)
    return VectorStore
def load_chatbot(max_tokens=300):
    """Build a 'stuff' QA chain backed by a low-temperature OpenAI LLM.

    Args:
        max_tokens: completion-length cap passed to the LLM (default 300).
    """
    llm = OpenAI(temperature=0.1, max_tokens=max_tokens)
    return load_qa_chain(llm=llm, chain_type="stuff")
def display_chat_history(chat_history):
    """Render each (sender, message, status) entry as a colored chat bubble.

    New messages get an orange background; old ones are tinted by sender.
    """
    for sender, message, status in chat_history:
        if status == "new":
            color = "#FFA07A"
        elif sender == "User":
            color = "#acf"
        else:
            color = "#caf"
        st.markdown(
            f"<div style='background-color: {color}; padding: 10px; "
            f"border-radius: 10px; margin: 10px;'>{sender}: {message}</div>",
            unsafe_allow_html=True,
        )
def remove_incomplete_sentences(text):
    """Keep only sentences that end with terminal punctuation (. ! or ?).

    Drops truncated trailing sentences that an LLM cut off mid-thought.
    """
    kept = []
    for sentence in sent_tokenize(text):
        if sentence.endswith(('.', '!', '?')):
            kept.append(sentence)
    return ' '.join(kept)
def remove_redundant_information(text):
    """Remove duplicate sentences while preserving their original order.

    The original used ``list(set(...))``, which scrambles sentence order
    nondeterministically (set iteration order is unspecified), producing
    garbled answers. ``dict.fromkeys`` deduplicates in first-seen order.
    """
    sentences = sent_tokenize(text)
    unique_sentences = list(dict.fromkeys(sentences))
    return ' '.join(unique_sentences)
# Upper bound on completion length, to avoid runaway/infinite generations.
# NOTE(review): defined but not referenced in this file — confirm callers.
MAX_TOKEN_LIMIT = 400
# (Removed a duplicate `import random` — the module is already imported
# at the top of the file.)
def generate_dynamic_question(text_chunks):
    """Frame a question around one randomly chosen chunk of text.

    Args:
        text_chunks: non-empty sequence of strings to sample from.

    Returns:
        A "What is the significance of: ..." question string.
    """
    chosen = random.choice(text_chunks)
    return f"What is the significance of: '{chosen}'?"
def display_example_questions(dynamic_question):
    """Show clickable starter-question boxes, but only before any chat exists.

    Args:
        dynamic_question: AI-generated question inserted into the fourth box.
    """
    if st.session_state['chat_history']:
        return  # a conversation is underway; don't show starter questions
    html = """
<div class="question-box" id="question1">Was genau ist ein Belegarzt?</div>
<div class="question-box" id="question2">Wofür wird die Alpha-ID verwendet?</div>
<br>
<div class="question-box" id="question3">Was sind die Vorteile des ambulanten operierens?</div>
<div class="question-box" id="question4">AI Generated Question: {0}</div>
""".format(dynamic_question)
    st.markdown(html, unsafe_allow_html=True)
def main():
    """Streamlit entry point: upload a PDF, then chat about its contents."""
    st.title("BinDocs Chat App")

    # CSS for the rounded, clickable starter-question boxes.
    st.markdown("""
<style>
.question-box {
    border: 1px solid orange;
    border-radius: 15px;
    padding: 10px;
    text-align: center;
    cursor: pointer;
    display: inline-block;
    width: 45%;
    margin: 2%;
}
</style>
""", unsafe_allow_html=True)

    if "chat_history" not in st.session_state:
        st.session_state['chat_history'] = []

    display_chat_history(st.session_state['chat_history'])

    new_messages_placeholder = st.empty()
    pdf = st.file_uploader("Upload your PDF", type="pdf")

    # Always bound so the JS template below can reference it.
    # BUG FIX: the original read `dynamic_question` (and an undefined name
    # `chunks`) outside the branch that could define them — NameError.
    dynamic_question = ""

    if pdf is not None:
        query = st.text_input("Ask questions about your PDF file (in any preferred language):")

        # Fire on the button, or when the typed query changed since last run.
        if st.button("Ask") or (query and query != st.session_state.get('last_input', '')):
            st.session_state['last_input'] = query
            st.session_state['chat_history'].append(("User", query, "new"))

            loading_message = st.empty()
            loading_message.text('Bot is thinking...')

            VectorStore = load_pdf(pdf)
            chain = load_chatbot(max_tokens=120)
            docs = VectorStore.similarity_search(query=query, k=2)
            with get_openai_callback() as cb:
                response = chain.run(input_documents=docs, question=query)

            # Post-processing: drop incomplete sentences, then duplicates.
            filtered_response = remove_incomplete_sentences(response)
            filtered_response = remove_redundant_information(filtered_response)

            # BUG FIX: original passed the undefined name `chunks`; build
            # the dynamic question from the retrieved documents instead.
            dynamic_question = generate_dynamic_question(
                [doc.page_content for doc in docs])
            display_example_questions(dynamic_question)

            st.session_state['chat_history'].append(("Bot", filtered_response, "new"))

            # Render only the latest user/bot exchange in the placeholder.
            for sender, message, status in st.session_state['chat_history'][-2:]:
                color = "#FFA07A" if status == "new" else "#acf" if sender == "User" else "#caf"
                new_messages_placeholder.markdown(
                    f"<div style='background-color: {color}; padding: 10px; border-radius: 10px; margin: 10px;'>{sender}: {message}</div>",
                    unsafe_allow_html=True)

            st.write("<script>document.getElementById('response').scrollIntoView();</script>", unsafe_allow_html=True)
            loading_message.empty()
            query = ""

        # Mark all messages as seen so the next rerun styles them as history.
        st.session_state['chat_history'] = [
            (sender, msg, "old") for sender, msg, _ in st.session_state['chat_history']]

    # JavaScript for handling question-box clicks.
    # BUG FIX: the original called .format() on a template full of bare JS
    # braces, which raises at runtime — literal braces must be doubled.
    st.markdown("""
<script>
document.getElementById('question1').onclick = function() {{
    document.querySelector('input').value = 'Was genau ist ein Belegarzt?';
    document.querySelector('input').dispatchEvent(new Event('change'));
}};
document.getElementById('question2').onclick = function() {{
    document.querySelector('input').value = 'Wofür wird die Alpha-ID verwendet?';
    document.querySelector('input').dispatchEvent(new Event('change'));
}};
document.getElementById('question3').onclick = function() {{
    document.querySelector('input').value = 'Was sind die Vorteile des ambulanten operierens?';
    document.querySelector('input').dispatchEvent(new Event('change'));
}};
document.getElementById('question4').onclick = function() {{
    document.querySelector('input').value = 'AI Generated Question: {0}';
    document.querySelector('input').dispatchEvent(new Event('change'));
}};
</script>
""".format(dynamic_question), unsafe_allow_html=True)


if __name__ == "__main__":
    main()
|