File size: 6,068 Bytes
2df9243
44c0e78
9917a1a
9c5d83d
2b04423
7c95914
2df9243
 
7c95914
2df9243
7c95914
 
 
 
 
 
 
161353e
2df9243
 
7c95914
44c0e78
 
 
 
 
 
 
668775b
44c0e78
 
 
 
 
47f6195
 
7c95914
 
abd1f1b
 
976634b
d3de2d8
976634b
d3de2d8
976634b
 
a98948f
976634b
a98948f
2df9243
7c95914
 
 
 
 
 
 
f8dc8d1
a98948f
403a475
7c95914
f7d3ba1
7c95914
f7d3ba1
 
 
 
a63f1b5
2df9243
7c95914
 
 
 
2df9243
7c95914
 
 
 
f9dbffb
7c95914
a63f1b5
e38f084
5751fd4
ebf2d4c
c239258
9ca7d21
 
 
 
 
2df9243
 
 
 
 
 
 
 
 
 
 
 
 
2b04423
 
 
7c95914
 
 
 
 
 
 
 
9ca7d21
 
48a4a46
7c95914
2e38376
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36c54d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c48ccc
 
36c54d4
4c48ccc
36c54d4
abd1f1b
0da8351
5616919
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import os
from huggingface_hub import Repository
import streamlit.components.v1 as components
from datasets import load_dataset
import random
import pickle
from nltk.tokenize import sent_tokenize
import nltk
from PyPDF2 import PdfReader
import streamlit as st
from streamlit_extras.add_vertical_space import add_vertical_space
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from my_component import my_component

# Download the Punkt sentence-tokenizer models required by sent_tokenize below.
nltk.download('punkt')

# Step 1: Clone the Dataset Repository
# NOTE(review): raises KeyError if the HUB_TOKEN env var is unset — TODO fail
# with a clearer message before attempting the clone.
repo = Repository(
    local_dir="Private_Book",  # Local directory to clone the repository
    repo_type="dataset",  # Specify that this is a dataset repository
    
    clone_from="Anne31415/Private_Book",  # Replace with your repository URL
    
    token=os.environ["HUB_TOKEN"]  # Use the secret token to authenticate
)
repo.git_pull()  # Pull the latest changes (if any)

# Step 2: Load the PDF File
pdf_file_path = "Private_Book/Glossar_PDF_webscraping.pdf"  # Replace with your PDF file path


# Sidebar contents: branding, API-key gate, and a short app description.
with st.sidebar:
    st.title(':orange_book: BinDoc GmbH')
    

    # The OpenAI key is read from the environment (not from st.secrets,
    # despite the old comment); the whole app halts without it.
    api_key = os.getenv("OPENAI_API_KEY")
    

    if not api_key:
        st.warning('API key is required to proceed.')
        st.stop()  # Stop the app if the API key is not provided

    # Marketing blurb, one markdown call per line.
    st.markdown("Experience the future of document interaction with the revolutionary")
    st.markdown("**BinDocs Chat App**.")
    st.markdown("Harnessing the power of a Large Language Model and AI technology,")
    st.markdown("this innovative platform redefines PDF engagement,")
    st.markdown("enabling dynamic conversations that bridge the gap between")
    st.markdown("human and machine intelligence.")

    add_vertical_space(3)  # Add more vertical space between text blocks
    st.write('Made with ❤️ by BinDoc GmbH')

def load_pdf(file_path):
    """Extract per-page text from a PDF and return a FAISS vector store over it.

    The store is cached on disk as "<pdf basename without extension>.pkl";
    subsequent calls load the pickle instead of re-embedding (which would
    cost OpenAI API credits).

    Args:
        file_path: Path to the PDF file to index.

    Returns:
        A FAISS vector store built from the pages' extracted text.
    """
    pdf_reader = PdfReader(file_path)
    chunks = []
    for page in pdf_reader.pages:
        text = page.extract_text()
        if text:  # skip pages with no extractable text (e.g. scanned images)
            chunks.append(text)

    # Bug fix: the original stripped the extension with [:-4], which only
    # works for 3-character extensions like ".pdf"; splitext is general.
    store_name = os.path.splitext(os.path.basename(file_path))[0]

    if os.path.exists(f"{store_name}.pkl"):
        # NOTE(review): unpickling is only safe because this file is produced
        # by this same app; never load pickles from untrusted sources.
        with open(f"{store_name}.pkl", "rb") as f:
            VectorStore = pickle.load(f)
    else:
        embeddings = OpenAIEmbeddings()
        VectorStore = FAISS.from_texts(chunks, embedding=embeddings)
        with open(f"{store_name}.pkl", "wb") as f:
            pickle.dump(VectorStore, f)

    return VectorStore


def load_chatbot(max_tokens=300):
    """Build a "stuff"-type question-answering chain backed by an OpenAI LLM.

    Args:
        max_tokens: Completion-length cap forwarded to the LLM (default 300).

    Returns:
        The loaded QA chain, ready for chain.run(...).
    """
    llm = OpenAI(temperature=0.1, max_tokens=max_tokens)
    return load_qa_chain(llm=llm, chain_type="stuff")

def display_chat_history(chat_history):
    """Render each (sender, message, status) triple as a colored chat bubble."""
    for sender, message, status in chat_history:
        if status == "new":
            background_color = "#FFA07A"
        elif sender == "User":
            background_color = "#acf"
        else:
            background_color = "#caf"
        st.markdown(
            f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{sender}: {message}</div>",
            unsafe_allow_html=True,
        )

def remove_incomplete_sentences(text):
    """Return *text* with any sentence lacking terminal punctuation removed.

    A sentence counts as complete when it ends with '.', '!' or '?'.
    """
    kept = []
    for sentence in sent_tokenize(text):
        if sentence.endswith(('.', '!', '?')):
            kept.append(sentence)
    return ' '.join(kept)

def remove_redundant_information(text):
    """Drop exact-duplicate sentences from *text*, preserving sentence order.

    Bug fix: the original used list(set(sentences)), which returns the
    surviving sentences in arbitrary (hash-dependent) order, scrambling the
    bot's answer. dict.fromkeys keeps first-seen order (guaranteed since
    Python 3.7) while still deduplicating.

    Args:
        text: The response text to clean up.

    Returns:
        The deduplicated sentences joined with single spaces.
    """
    sentences = sent_tokenize(text)
    unique_sentences = list(dict.fromkeys(sentences))
    return ' '.join(unique_sentences)

# Define a maximum token limit to avoid infinite loops
# NOTE(review): MAX_TOKEN_LIMIT is never referenced anywhere in this file —
# confirm whether it is dead code or meant to replace the hard-coded 120 in main().
MAX_TOKEN_LIMIT = 400

import random  # NOTE(review): duplicate of the top-of-file import; harmless no-op.


def main():
    """Streamlit entry point: render the chat UI and answer one query per rerun.

    Streamlit re-executes this function on every user interaction. The
    conversation persists across reruns in st.session_state['chat_history']
    as (sender, message, status) triples, where status is "new" for messages
    produced during the current rerun and "old" otherwise.
    """
    st.title("BinDocs Chat App")

    # Initialize the persistent history on the very first run.
    if "chat_history" not in st.session_state:
        st.session_state['chat_history'] = []

    display_chat_history(st.session_state['chat_history'])

    # Placeholder into which only the freshly generated exchange is rendered.
    new_messages_placeholder = st.empty()

    query = st.text_input("Ask questions about your PDF file (in any preferred language):")

    # Quick-access buttons that substitute a canned German question for the query.
    if st.button("Was genau ist ein Belegarzt?"):
        query = "Was genau ist ein Belegarzt?"
    if st.button("Wofür wird die Alpha-ID verwendet?"):
        query = "Wofür wird die Alpha-ID verwendet?"
    if st.button("Was sind die Vorteile des ambulanten operierens?"):
        query = "Was sind die Vorteile des ambulanten operierens?"

    if query:
        st.session_state['last_input'] = query
        st.session_state['chat_history'].append(("User", query, "new"))

        loading_message = st.empty()
        loading_message.text('Bot is thinking...')
        
        # Build (or load the cached) vector store, then run retrieval QA
        # over the 2 most similar chunks.
        VectorStore = load_pdf(pdf_file_path)
        max_tokens = 120
        chain = load_chatbot(max_tokens=max_tokens)
        docs = VectorStore.similarity_search(query=query, k=2)
        
        with get_openai_callback() as cb:
            response = chain.run(input_documents=docs, question=query)

            # Post-processing to remove incomplete sentences and redundant information
            filtered_response = remove_incomplete_sentences(response)
            filtered_response = remove_redundant_information(filtered_response)
            
            st.session_state['chat_history'].append(("Bot", filtered_response, "new"))

            # Render only the last user/bot pair into the placeholder; older
            # messages were already drawn by display_chat_history above.
            new_messages = st.session_state['chat_history'][-2:]
            for chat in new_messages:
                background_color = "#FFA07A" if chat[2] == "new" else "#acf" if chat[0] == "User" else "#caf"
                new_messages_placeholder.markdown(f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{chat[0]}: {chat[1]}</div>", unsafe_allow_html=True)

            # NOTE(review): injecting a <script> tag via st.write is typically
            # stripped by Streamlit's HTML sanitization — this scroll call
            # likely has no effect; confirm.
            st.write("<script>document.getElementById('response').scrollIntoView();</script>", unsafe_allow_html=True)

            loading_message.empty()

            # NOTE(review): no-op — rebinding the local does not clear the
            # text_input widget; its value lives in Streamlit's widget state.
            query = ""
    else:
        # Also shown on the initial page load, before any query was entered.
        st.warning("Please enter a query before asking questions.")

    # Demote everything to "old" so the next rerun draws it in history colors.
    st.session_state['chat_history'] = [(sender, msg, "old") for sender, msg, _ in st.session_state['chat_history']]
    

# Run the app when executed as a script (streamlit run <file>).
if __name__ == "__main__":
    main()