File size: 7,845 Bytes
2df9243
9c5d83d
2b04423
7c95914
2df9243
 
7c95914
2df9243
7c95914
 
 
 
 
 
 
2df9243
 
7c95914
47f6195
 
7c95914
 
abd1f1b
 
976634b
d3de2d8
976634b
d3de2d8
976634b
 
a98948f
976634b
a98948f
2df9243
7c95914
 
 
 
 
 
 
f8dc8d1
a98948f
403a475
7c95914
f7d3ba1
7c95914
f7d3ba1
 
 
 
cab37f8
2df9243
7c95914
 
 
 
2df9243
7c95914
 
 
 
f9dbffb
7c95914
e38f084
5751fd4
ebf2d4c
c239258
9ca7d21
 
 
 
 
2df9243
 
 
 
 
 
 
 
 
 
 
 
 
2b04423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7c95914
 
 
f3ee795
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7c95914
 
 
 
 
9ca7d21
 
eb1ac12
 
 
bcd8992
7c95914
bcd8992
2df9243
bcd8992
586a969
bcd8992
 
 
 
0f37a25
ffab811
2df9243
 
bcd8992
 
2b04423
2df9243
 
 
 
2b04423
 
 
15fb41d
 
 
abd1f1b
 
 
 
 
 
 
 
 
 
 
 
2b04423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
abd1f1b
7c95914
9ca7d21
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import os
from datasets import load_dataset
import random
import pickle
from nltk.tokenize import sent_tokenize
import nltk
from PyPDF2 import PdfReader
import streamlit as st
from streamlit_extras.add_vertical_space import add_vertical_space
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback

nltk.download('punkt')



# Sidebar contents: branding, API-key gate, and marketing copy.
with st.sidebar:
    st.title(':orange_book: BinDoc GmbH')
    

    # Read the OpenAI key from the environment variable OPENAI_API_KEY.
    # (An earlier comment claimed st.secrets was used — it is not.)
    api_key = os.getenv("OPENAI_API_KEY")
    

    # Hard gate: without a key the rest of the script never runs.
    if not api_key:
        st.warning('API key is required to proceed.')
        st.stop()  # Stop the app if the API key is not provided

    st.markdown("Experience the future of document interaction with the revolutionary")
    st.markdown("**BinDocs Chat App**.")
    st.markdown("Harnessing the power of a Large Language Model and AI technology,")
    st.markdown("this innovative platform redefines PDF engagement,")
    st.markdown("enabling dynamic conversations that bridge the gap between")
    st.markdown("human and machine intelligence.")

    add_vertical_space(3)  # Add more vertical space between text blocks
    st.write('Made with ❤️ by BinDoc GmbH')

def load_pdf(file_path):
    """Extract per-page text from an uploaded PDF and return a FAISS vector store.

    The store is cached on disk as ``<pdf name>.pkl``, keyed on the file
    name only — re-uploading a *different* PDF under the same name will
    serve stale vectors.  # NOTE(review): consider keying on content hash.

    Args:
        file_path: File-like object with a ``.name`` attribute (e.g. a
            Streamlit ``UploadedFile``) containing the PDF.

    Returns:
        FAISS vector store built over the PDF's per-page text chunks.

    Raises:
        ValueError: If the PDF yields no extractable text (e.g. scanned
            images) — previously this crashed opaquely inside FAISS.
    """
    pdf_reader = PdfReader(file_path)
    # One chunk per page; pages with no extractable text are skipped.
    chunks = []
    for page in pdf_reader.pages:
        text = page.extract_text()
        if text:
            chunks.append(text)

    if not chunks:
        raise ValueError("No extractable text found in the uploaded PDF.")

    store_name = file_path.name[:-4]  # strip the ".pdf" extension
    cache_path = f"{store_name}.pkl"

    if os.path.exists(cache_path):
        # SECURITY NOTE: unpickling is only acceptable because this cache is
        # written by this app itself; never load .pkl files from untrusted
        # sources.
        with open(cache_path, "rb") as f:
            VectorStore = pickle.load(f)
    else:
        embeddings = OpenAIEmbeddings()
        VectorStore = FAISS.from_texts(chunks, embedding=embeddings)
        with open(cache_path, "wb") as f:
            pickle.dump(VectorStore, f)

    return VectorStore

def load_chatbot(max_tokens=300):
    """Build a 'stuff'-type QA chain backed by a low-temperature OpenAI LLM.

    Args:
        max_tokens: Completion-length cap passed through to the LLM.

    Returns:
        A ready-to-run question-answering chain.
    """
    llm = OpenAI(temperature=0.1, max_tokens=max_tokens)
    return load_qa_chain(llm=llm, chain_type="stuff")
    

def display_chat_history(chat_history):
    """Render each (sender, message, status) entry as a colored chat bubble.

    "new" entries get a salmon highlight; otherwise user messages are light
    blue and bot messages light purple.
    """
    for sender, message, status in chat_history:
        if status == "new":
            color = "#FFA07A"
        elif sender == "User":
            color = "#acf"
        else:
            color = "#caf"
        st.markdown(
            f"<div style='background-color: {color}; padding: 10px; "
            f"border-radius: 10px; margin: 10px;'>{sender}: {message}</div>",
            unsafe_allow_html=True,
        )

def remove_incomplete_sentences(text):
    """Strip sentences that were cut off mid-thought.

    Tokenizes *text* with NLTK and keeps only sentences that end in a
    terminal punctuation mark (., !, or ?), rejoined with single spaces.
    """
    kept = []
    for sentence in sent_tokenize(text):
        if sentence.endswith(('.', '!', '?')):
            kept.append(sentence)
    return ' '.join(kept)

def remove_redundant_information(text):
    """Remove duplicate sentences from *text*, preserving original order.

    Bug fix: the previous ``list(set(sentences))`` deduplicated but scrambled
    sentence order nondeterministically (set iteration order). ``dict.fromkeys``
    keeps the first occurrence of each sentence in its original position.
    """
    sentences = sent_tokenize(text)
    unique_sentences = list(dict.fromkeys(sentences))
    return ' '.join(unique_sentences)

# Define a maximum token limit to avoid infinite loops
# NOTE(review): MAX_TOKEN_LIMIT is never read anywhere in this file — confirm
# whether it was meant to cap `max_tokens` in main(), or remove it.
MAX_TOKEN_LIMIT = 400

import random  # NOTE(review): redundant — `random` is already imported at the top of the file

def generate_dynamic_question(text_chunks):
    """Frame a follow-up question around one randomly chosen text chunk.

    Args:
        text_chunks: Non-empty sequence of candidate text snippets.

    Returns:
        A templated question string quoting the chosen snippet.
    """
    chosen = random.choice(text_chunks)
    return "What is the significance of: '{}'?".format(chosen)

def display_example_questions(dynamic_question):
    """Render four clickable starter-question boxes (three fixed German ones
    plus one AI-generated), but only while the chat history is still empty."""
    if st.session_state['chat_history']:
        return
    boxes_html = """
            <div class="question-box" id="question1">Was genau ist ein Belegarzt?</div>
            <div class="question-box" id="question2">Wofür wird die Alpha-ID verwendet?</div>
            <br>
            <div class="question-box" id="question3">Was sind die Vorteile des ambulanten operierens?</div>
            <div class="question-box" id="question4">AI Generated Question: {0}</div>
        """.format(dynamic_question)
    st.markdown(boxes_html, unsafe_allow_html=True)

def main():
    """Streamlit entry point: render the chat UI, index an uploaded PDF, and
    answer questions about it through a retrieval-QA chain.

    Side effects: mutates ``st.session_state['chat_history']`` and
    ``st.session_state['last_input']``, and writes a ``<name>.pkl``
    vector-store cache via ``load_pdf``.
    """
    st.title("BinDocs Chat App")

    # CSS for the rounded example-question boxes.
    st.markdown("""
        <style>
            .question-box {
                border: 1px solid orange;
                border-radius: 15px;
                padding: 10px;
                text-align: center;
                cursor: pointer;
                display: inline-block;
                width: 45%;
                margin: 2%;
            }
        </style>
    """, unsafe_allow_html=True)

    if "chat_history" not in st.session_state:
        st.session_state['chat_history'] = []

    display_chat_history(st.session_state['chat_history'])

    new_messages_placeholder = st.empty()

    pdf = st.file_uploader("Upload your PDF", type="pdf")

    # Bug fix: previously this was only assigned inside the inner "Ask"
    # branch, so the .format() call at the bottom raised a NameError on every
    # run where no question had been asked yet.
    dynamic_question = ""

    if pdf is not None:
        query = st.text_input("Ask questions about your PDF file (in any preferred language):")

        # Fire on the button, or when the typed query changed since last run.
        if st.button("Ask") or (query and query != st.session_state.get('last_input', '')):
            st.session_state['last_input'] = query
            st.session_state['chat_history'].append(("User", query, "new"))

            loading_message = st.empty()
            loading_message.text('Bot is thinking...')

            VectorStore = load_pdf(pdf)
            max_tokens = 120
            chain = load_chatbot(max_tokens=max_tokens)
            docs = VectorStore.similarity_search(query=query, k=2)

            with get_openai_callback() as cb:
                response = chain.run(input_documents=docs, question=query)

            # Post-processing: drop incomplete sentences, then duplicates.
            filtered_response = remove_incomplete_sentences(response)
            filtered_response = remove_redundant_information(filtered_response)

            # Bug fix: `chunks` was a NameError here (it is local to
            # load_pdf); use the retrieved documents' text instead.
            source_texts = [doc.page_content for doc in docs]
            dynamic_question = generate_dynamic_question(source_texts)
            display_example_questions(dynamic_question)

            st.session_state['chat_history'].append(("Bot", filtered_response, "new"))

            # Echo only the latest user/bot exchange into the placeholder.
            new_messages = st.session_state['chat_history'][-2:]
            for sender, message, status in new_messages:
                background_color = "#FFA07A" if status == "new" else "#acf" if sender == "User" else "#caf"
                new_messages_placeholder.markdown(f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{sender}: {message}</div>", unsafe_allow_html=True)

            # NOTE(review): scripts injected via st.write/st.markdown are
            # sandboxed by Streamlit and do not execute in the parent page —
            # confirm whether this scroll ever worked.
            st.write("<script>document.getElementById('response').scrollIntoView();</script>", unsafe_allow_html=True)

            loading_message.empty()

        # Demote everything to "old" so past messages render in muted colors.
        st.session_state['chat_history'] = [(sender, msg, "old") for sender, msg, _ in st.session_state['chat_history']]

    # JavaScript for handling question box clicks.
    # Bug fix: the JS braces must be escaped as {{ }} — the previous template
    # passed raw { } to str.format(), which raised ValueError on every run.
    st.markdown("""
        <script>
            document.getElementById('question1').onclick = function() {{
                document.querySelector('input').value = 'Was genau ist ein Belegarzt?';
                document.querySelector('input').dispatchEvent(new Event('change'));
            }};
            document.getElementById('question2').onclick = function() {{
                document.querySelector('input').value = 'Wofür wird die Alpha-ID verwendet?';
                document.querySelector('input').dispatchEvent(new Event('change'));
            }};
            document.getElementById('question3').onclick = function() {{
                document.querySelector('input').value = 'Was sind die Vorteile des ambulanten operierens?';
                document.querySelector('input').dispatchEvent(new Event('change'));
            }};
            document.getElementById('question4').onclick = function() {{
                document.querySelector('input').value = 'AI Generated Question: {0}';
                document.querySelector('input').dispatchEvent(new Event('change'));
            }};
        </script>
    """.format(dynamic_question), unsafe_allow_html=True)

# Script entry point: run the Streamlit app.
if __name__ == "__main__":
    main()