File size: 5,727 Bytes
2df9243
9917a1a
9c5d83d
2b04423
7c95914
2df9243
 
7c95914
2df9243
7c95914
 
 
 
 
 
 
161353e
2df9243
 
7c95914
47f6195
 
7c95914
 
abd1f1b
 
976634b
d3de2d8
976634b
d3de2d8
976634b
 
a98948f
976634b
a98948f
2df9243
7c95914
 
 
 
 
 
 
f8dc8d1
a98948f
403a475
7c95914
f7d3ba1
7c95914
f7d3ba1
 
 
 
cab37f8
2df9243
7c95914
 
 
 
2df9243
7c95914
 
 
 
f9dbffb
7c95914
e38f084
5751fd4
ebf2d4c
c239258
9ca7d21
 
 
 
 
2df9243
 
 
 
 
 
 
 
 
 
 
 
 
2b04423
 
 
7c95914
 
 
 
 
 
 
 
9ca7d21
 
eb1ac12
 
48a4a46
7c95914
7ccf47a
 
 
 
 
a58a97b
7ccf47a
a58a97b
7ccf47a
 
 
2df9243
bcd8992
586a969
bcd8992
 
 
36c54d4
 
 
 
2df9243
36c54d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161353e
abd1f1b
36c54d4
0da8351
5616919
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import os
import streamlit.components.v1 as components
from datasets import load_dataset
import random
import pickle
from nltk.tokenize import sent_tokenize
import nltk
from PyPDF2 import PdfReader
import streamlit as st
from streamlit_extras.add_vertical_space import add_vertical_space
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from my_component import my_component

# Fetch the NLTK 'punkt' resource at startup (presumably required by the
# sent_tokenize calls further down — confirm against the NLTK version in use).
nltk.download('punkt')


# Sidebar: branding, API-key gate and product blurb.
with st.sidebar:
    st.title(':orange_book: BinDoc GmbH')

    # The OpenAI key is read from the process environment.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        # Nothing below can work without a key, so halt this script run here.
        st.warning('API key is required to proceed.')
        st.stop()

    # One markdown call per line keeps each sentence fragment in its own block.
    for _blurb_line in (
        "Experience the future of document interaction with the revolutionary",
        "**BinDocs Chat App**.",
        "Harnessing the power of a Large Language Model and AI technology,",
        "this innovative platform redefines PDF engagement,",
        "enabling dynamic conversations that bridge the gap between",
        "human and machine intelligence.",
    ):
        st.markdown(_blurb_line)

    # Spacer between the blurb and the footer line.
    add_vertical_space(3)
    st.write('Made with ❤️ by BinDoc GmbH')

def load_pdf(file_path):
    """Return a FAISS vector store for a PDF, building and caching it on first use.

    The store is cached as ``<name without extension>.pkl`` relative to the
    current working directory. When that file exists it is loaded directly and
    the PDF is not parsed at all.

    Args:
        file_path: Either an object exposing a ``.name`` attribute (such as an
            uploaded file) or a plain path string. It is handed to
            ``PdfReader`` unchanged when the store has to be built.

    Returns:
        The vector store, unpickled from the cache or freshly built with one
        text entry per page that yielded text.

    Raises:
        ValueError: If no cache exists and no page of the PDF yields any text.
    """
    # Uploaded files carry their filename in ``.name``; plain paths are used as is.
    source_name = getattr(file_path, "name", file_path)
    store_name = os.path.splitext(str(source_name))[0]
    cache_path = f"{store_name}.pkl"

    if os.path.exists(cache_path):
        # SECURITY: pickle.load can execute arbitrary code. This is only safe
        # while the cache files are written exclusively by this application.
        # NOTE(review): the cache is keyed by file name only, so a different
        # PDF with the same name silently reuses the old store.
        with open(cache_path, "rb") as f:
            return pickle.load(f)

    pdf_reader = PdfReader(file_path)
    page_texts = (page.extract_text() for page in pdf_reader.pages)
    # Pages without extractable text (None or "") are skipped.
    chunks = [text for text in page_texts if text]
    if not chunks:
        # Fail with a clear message instead of an opaque error from the
        # vector-store constructor on an empty text list.
        raise ValueError(f"No extractable text found in PDF: {source_name}")

    embeddings = OpenAIEmbeddings()
    VectorStore = FAISS.from_texts(chunks, embedding=embeddings)
    with open(cache_path, "wb") as f:
        pickle.dump(VectorStore, f)

    return VectorStore

def load_chatbot(max_tokens=300):
    """Build a "stuff"-type question-answering chain on top of an OpenAI LLM.

    Args:
        max_tokens: Completion token cap passed to the ``OpenAI`` LLM.

    Returns:
        The chain object produced by ``load_qa_chain``.
    """
    # Temperature is fixed at 0.1 for every chain created here.
    llm = OpenAI(temperature=0.1, max_tokens=max_tokens)
    return load_qa_chain(llm=llm, chain_type="stuff")
    

def display_chat_history(chat_history):
    """Render each chat entry as a colored HTML bubble via ``st.markdown``.

    Args:
        chat_history: Iterable of entries indexable as
            ``(sender, message, status)``. Entries whose status is ``"new"``
            get the highlight color; otherwise the color depends on whether
            the sender is ``"User"``.
    """
    # Local import keeps this function self-contained.
    import html

    for chat in chat_history:
        sender, message, status = chat[0], chat[1], chat[2]

        if status == "new":
            background_color = "#FFA07A"
        elif sender == "User":
            background_color = "#acf"
        else:
            background_color = "#caf"

        # The bubble is emitted with unsafe_allow_html=True, so user- and
        # model-supplied text must be escaped to keep it from injecting markup.
        safe_sender = html.escape(str(sender))
        safe_message = html.escape(str(message))
        st.markdown(f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{safe_sender}: {safe_message}</div>", unsafe_allow_html=True)

def remove_incomplete_sentences(text):
    """Drop sentences that do not end in terminal punctuation.

    The text is split with ``sent_tokenize``; only sentences ending in ``.``,
    ``!`` or ``?`` are kept, joined back together with single spaces.
    """
    sentence_enders = ('.', '!', '?')
    return ' '.join(
        sentence
        for sentence in sent_tokenize(text)
        if sentence.endswith(sentence_enders)
    )

def remove_redundant_information(text):
    """Remove repeated sentences while keeping the original sentence order.

    The text is split with ``sent_tokenize``; the first occurrence of each
    distinct sentence is kept and the result is joined with single spaces.
    """
    sentences = sent_tokenize(text)
    # dict.fromkeys de-duplicates while preserving first-seen order; a set
    # would return the sentences in arbitrary order and scramble the answer.
    unique_sentences = dict.fromkeys(sentences)
    return ' '.join(unique_sentences)

# Upper bound on tokens, intended as a guard against runaway generation loops.
# NOTE(review): nothing in the visible code references this constant — main()
# hard-codes max_tokens = 120 — so confirm whether it is still needed.
MAX_TOKEN_LIMIT = 400

import random


def main():
    """Run the chat page: show history, accept a PDF and a question, answer it.

    Flow of a single script run:
      1. Render the stored chat history.
      2. Ask for a PDF upload and a free-text question; three example
         questions are offered as buttons once a PDF is present.
      3. If there is a question, build or load the vector store, fetch the two
         most similar chunks, run the QA chain and post-process the answer.
      4. Show the new exchange in a reserved slot, then mark every history
         entry as "old".
    """
    st.title("BinDocs Chat App")

    if "chat_history" not in st.session_state:
        st.session_state['chat_history'] = []

    display_chat_history(st.session_state['chat_history'])

    # Reserved slot, created before the widgets so the newest exchange is
    # rendered above them.
    new_messages_placeholder = st.empty()

    pdf = st.file_uploader("Upload your PDF", type="pdf")

    query = st.text_input("Ask questions about your PDF file (in any preferred language):")

    if not pdf:
        st.warning("Please upload a PDF file to proceed.")
    else:
        # All three buttons are rendered on every run; a clicked button
        # overrides whatever is in the text box.
        example_questions = (
            "Was genau ist ein Belegarzt?",
            "Wofür wird die Alpha-ID verwendet?",
            "Was sind die Vorteile des ambulanten operierens?",
        )
        for example_question in example_questions:
            if st.button(example_question):
                query = example_question

        if query:
            st.session_state['last_input'] = query
            st.session_state['chat_history'].append(("User", query, "new"))

            loading_message = st.empty()
            loading_message.text('Bot is thinking...')

            try:
                VectorStore = load_pdf(pdf)
                max_tokens = 120
                chain = load_chatbot(max_tokens=max_tokens)
                docs = VectorStore.similarity_search(query=query, k=2)

                with get_openai_callback():
                    response = chain.run(input_documents=docs, question=query)
            finally:
                # Clear the status line even when answering fails, so it does
                # not linger next to the error output.
                loading_message.empty()

            # Post-processing to remove incomplete sentences and redundant information
            filtered_response = remove_incomplete_sentences(response)
            filtered_response = remove_redundant_information(filtered_response)

            st.session_state['chat_history'].append(("Bot", filtered_response, "new"))

            # st.empty() holds a single element, so writing both bubbles to it
            # directly would leave only the last one visible. A container
            # inside the placeholder shows the user and bot messages together.
            with new_messages_placeholder.container():
                display_chat_history(st.session_state['chat_history'][-2:])

            # NOTE(review): no element with id 'response' is created in this
            # file, and inline scripts may not be executed here — confirm
            # whether this line has any effect.
            st.write("<script>document.getElementById('response').scrollIntoView();</script>", unsafe_allow_html=True)
        else:
            # A PDF is already uploaded in this branch; only the question is missing.
            st.warning("Please enter a question or choose one of the example questions.")

        # Everything shown so far counts as history on the next run.
        st.session_state['chat_history'] = [(sender, msg, "old") for sender, msg, _ in st.session_state['chat_history']]

    my_component()


# Script entry point: run the app only when this file is executed directly.
if __name__ == "__main__":
    main()